Module: Linguist::Samples

Defined in:
lib/linguist/samples.rb

Overview

Model for accessing classifier training data.

Constant Summary collapse

ROOT =

Path to samples root directory

File.expand_path("../../../samples", __FILE__)
PATH =

Path for serialized samples db

File.expand_path('../samples.json', __FILE__)
DATA =
YAML.load_file(PATH)

Class Method Summary collapse

Class Method Details

.data ⇒ Object

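Public: Build the samples database used by the classifier from all samples.

Returns a Hash with 'extnames', 'filenames' and 'md5' keys, plus whatever Classifier.train! stores in it.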



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File 'lib/linguist/samples.rb', line 67

def self.data
  db = {}
  db['extnames'] = {}
  db['filenames'] = {}

  each do |sample|
    language_name = sample[:language]

    if sample[:extname]
      db['extnames'][language_name] ||= []
      if !db['extnames'][language_name].include?(sample[:extname])
        db['extnames'][language_name] << sample[:extname]
        db['extnames'][language_name].sort!
      end
    end

    if sample[:filename]
      db['filenames'][language_name] ||= []
      db['filenames'][language_name] << sample[:filename]
      db['filenames'][language_name].sort!
    end

    data = File.read(sample[:path])
    Classifier.train!(db, language_name, data)
  end

  db['md5'] = Linguist::MD5.hexdigest(db)

  db
end

.each(&block) ⇒ Object

Public: Iterate over each sample.

&block - Yields Sample to block

Returns nothing.



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# File 'lib/linguist/samples.rb', line 25

# Public: Iterate over each sample.
#
# Walks ROOT/<category>/ and yields one Hash per sample file:
#   :path     - String path to the sample file.
#   :language - String name of the category directory containing the sample.
#   :extname  - String extension including the leading dot (samples placed
#               directly inside the category directory).
#   :filename - String basename (samples placed in <category>/filenames/).
#
# Every directory listing is sorted before it is walked, so samples are
# yielded in the same order on every run; the raw order of Dir.entries is
# not guaranteed to be sorted and can differ between filesystems.
#
# &block - Yields Sample to block
#
# Raises RuntimeError if a file directly inside a category directory has no
# extension.
#
# Returns nothing.
def self.each(&block)
  Dir.entries(ROOT).sort.each do |category|
    next if category == '.' || category == '..'

    # Skip text and binary for now
    # Possibly reconsider this later
    next if category == 'Text' || category == 'Binary'

    dirname = File.join(ROOT, category)
    Dir.entries(dirname).sort.each do |filename|
      next if filename == '.' || filename == '..'

      if filename == 'filenames'
        # Samples identified by their whole filename live in a dedicated subdir.
        Dir.entries(File.join(dirname, filename)).sort.each do |subfilename|
          next if subfilename == '.' || subfilename == '..'

          yield({
            :path     => File.join(dirname, filename, subfilename),
            :language => category,
            :filename => subfilename
          })
        end
      else
        if File.extname(filename) == ""
          raise "#{File.join(dirname, filename)} is missing an extension, maybe it belongs in filenames/ subdir"
        end

        yield({
          :path     => File.join(dirname, filename),
          :language => category,
          :extname  => File.extname(filename)
        })
      end
    end
  end

  nil
end