module Linguist::Samples

Model for accessing classifier training data.

Constants

PATH

Path for serialized samples db

ROOT

Path to samples root directory

Public Class Methods

cache() click to toggle source

Returns the Hash deserialized from the samples db file at PATH. The result is memoized after the first load.

# File lib/linguist/samples.rb, line 21
# Public: Load the serialized samples db from PATH.
#
# The file is parsed with Yajl when that constant is defined, otherwise
# with YAML. The parsed value is memoized in @cache, so the file is only
# read again if the previous result was nil or false.
#
# Returns the deserialized samples object.
def self.cache
  return @cache if @cache

  parser = defined?(Yajl) ? Yajl : YAML
  raw    = File.read(PATH, encoding: 'utf-8')
  @cache = parser.load(raw)
end
data() click to toggle source

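Public: Build the samples db from all samples. Collects each language's extnames, interpreters and filenames, passes every sample's contents to Classifier.train!, and stores an MD5 digest of the result under 'md5'.

Returns the db Hash.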

# File lib/linguist/samples.rb, line 71
# Public: Build the samples db from every sample yielded by `each`.
#
# For each sample the language's extname and interpreter are recorded once
# (duplicates skipped), its filename is always appended, and every list is
# kept sorted after each insertion. The sample's file contents are then
# handed to Classifier.train! together with the db. Finally an MD5 digest
# of the db is stored under the 'md5' key.
#
# Returns the db Hash.
def self.data
  db = { 'extnames' => {}, 'interpreters' => {}, 'filenames' => {} }

  each do |sample|
    language = sample[:language]

    # These two sections only record a value the first time it is seen.
    [['extnames', :extname], ['interpreters', :interpreter]].each do |section, key|
      value = sample[key]
      next unless value

      known = (db[section][language] ||= [])
      next if known.include?(value)

      known << value
      known.sort!
    end

    # Filenames are appended unconditionally (no duplicate check).
    if (filename = sample[:filename])
      listed = (db['filenames'][language] ||= [])
      listed << filename
      listed.sort!
    end

    Classifier.train!(db, language, File.read(sample[:path]))
  end

  db['md5'] = Linguist::MD5.hexdigest(db)
  db
end
each() { |{ :path => join, :language => category, :filename => subfilename }| ... } click to toggle source

Public: Iterate over each sample.

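&block - Block called once per sample with a Hash. Every Hash has :path and :language; samples found under a "filenames" directory also carry :filename, while all other samples carry :interpreter and :extname (nil when the file has no extension).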

Returns nothing.

# File lib/linguist/samples.rb, line 33
# Public: Iterate over every sample found under ROOT.
#
# Each directory directly under ROOT is treated as a language category
# (visited in sorted order). Inside a category, an entry named "filenames"
# is a directory whose children are yielded with their :filename; every
# other entry is read from disk and yielded with its :interpreter (from
# Shebang.interpreter) and :extname (nil when there is no extension).
#
# &block - Receives one Hash per sample.
#
# Returns nothing.
def self.each(&block)
  dot_entries = ['.', '..']

  (Dir.entries(ROOT).sort! - dot_entries).each do |language|
    language_dir = File.join(ROOT, language)

    (Dir.entries(language_dir) - dot_entries).each do |entry|
      if entry == 'filenames'
        # Samples identified by their exact filename rather than extension.
        (Dir.entries(File.join(language_dir, entry)) - dot_entries).each do |name|
          yield({
            :path     => File.join(language_dir, entry, name),
            :language => language,
            :filename => name
          })
        end
        next
      end

      sample_path = File.join(language_dir, entry)
      ext = File.extname(entry)

      yield({
        :path        => sample_path,
        :language    => language,
        :interpreter => Shebang.interpreter(File.read(sample_path)),
        :extname     => (ext.empty? ? nil : ext)
      })
    end
  end

  nil
end