Class: Wingalingding::Scraper

Inherits:
Object
  • Object
show all
Defined in:
lib/scraper.rb

Constant Summary collapse

# URL template for one listing page; the %s placeholder is filled with the
# start offset by #url.
SOURCE =
"http://www.fileformat.info/info/charset/UTF-8/list.htm?start=%s"
# CSS selector handed to the parsed page by #character_table to pick table rows.
CSS_SELECTOR =
"table.list tr"
# Directory that holds the chars_* partial files and the merged table
# (../charsets relative to this file).
CHARACTER_TABLE_DIRECTORY =
File.join( File.dirname(__FILE__), "../charsets")
# File name, inside CHARACTER_TABLE_DIRECTORY, of the merged table written by
# #cleanup and read by .character_map.
COMPLETE_CHARACTER_TABLE =
"utf8.txt"
# Path of the file opened in "w+" mode by #initialize and closed by #scrape.
OUTPUT_FILENAME =
File.join(CHARACTER_TABLE_DIRECTORY, "chars.txt")
# Path of the log file appended to by #log (../log/scrape.log relative to this file).
LOG =
File.join( File.dirname(__FILE__), "../log/scrape.log")
# When true, #cleanup skips #delete_partials and leaves the chars_* files in place.
KEEP_PARTIAL_FILES =
true

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(start = 0) ⇒ Scraper

Returns a new instance of Scraper.



18
19
20
21
22
23
# File 'lib/scraper.rb', line 18

# Prepares a scraper whose first request begins at the given offset.
#
# Both cursors (@start and @next) begin at +start+. The character table
# directory is created when missing, and OUTPUT_FILENAME is opened in "w+"
# mode (truncating an existing file); the handle is kept in @output_file.
#
# @param start [Integer] offset of the first character to request
def initialize(start = 0)
  @next = @start = start
  FileUtils.mkdir_p(CHARACTER_TABLE_DIRECTORY)
  @output_file = File.new(OUTPUT_FILENAME, "w+")
end

Class Method Details

.character_map ⇒ Object



100
101
102
# File 'lib/scraper.rb', line 100

# Reads the merged character table from disk.
#
# @return [Array<String>] every line of COMPLETE_CHARACTER_TABLE inside
#   CHARACTER_TABLE_DIRECTORY, line terminators included
def self.character_map
  table_path = File.join(CHARACTER_TABLE_DIRECTORY, COMPLETE_CHARACTER_TABLE)
  File.readlines(table_path)
end

Instance Method Details

#already_recorded ⇒ Object



71
72
73
# File 'lib/scraper.rb', line 71

# Looks for a partial file that already covers the batch beginning at @start.
#
# A file qualifies when its path ends in chars_<start>-<digits>.txt, where
# <start> is @start zero-padded to five digits.
#
# @return [String, nil] path of the first qualifying file, or nil when none exists
def already_recorded
  padded_start = "%05i" % @start.to_i
  wanted = /chars_#{padded_start}\-[0-9]+\.txt$/
  character_files.find { |path| path =~ wanted }
end

#character_files ⇒ Object



34
35
36
# File 'lib/scraper.rb', line 34

# Lists the numbered character files in the character table directory.
#
# An entry is kept when its name starts with "chars", contains at least one
# digit, and ends in ".txt".
#
# @return [Array<String>] matching entries, each joined onto CHARACTER_TABLE_DIRECTORY
def character_files
  Dir.entries(CHARACTER_TABLE_DIRECTORY)
     .grep(/^chars.*[0-9]+.*\.txt$/)
     .map { |name| File.join(CHARACTER_TABLE_DIRECTORY, name) }
end

#character_table ⇒ Object



48
49
50
# File 'lib/scraper.rb', line 48

# Turns the fetched page into one tab-separated string per table row.
#
# Every node matched by CSS_SELECTOR contributes one string: the content of
# its "td" cells joined with tabs (an empty string when it has no such cells).
#
# @return [Array<String>] one tab-joined line per matched row
def character_table
  rows = doc.css(CSS_SELECTOR)
  rows.map do |row|
    cells = row.css("td").map { |cell| cell.content }
    cells.join("\t")
  end
end

#cleanup ⇒ Object



105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/scraper.rb', line 105

# Merges every chars_* partial file into the complete character table.
#
# Partials are combined in ascending order of the first number in their file
# name, each line is stripped, blank lines are dropped, duplicate lines are
# removed, and the result is written to COMPLETE_CHARACTER_TABLE. The partial
# files are deleted afterwards unless KEEP_PARTIAL_FILES is set.
#
# @return [Object] whatever the final #log call returns
def cleanup
  log "Cleaning up files"
  final_charset_filename = File.join(CHARACTER_TABLE_DIRECTORY, COMPLETE_CHARACTER_TABLE)

  # Dir.entries makes no ordering promise, so sort explicitly. Sorting on the
  # parsed start offset (with the name as tie-breaker) keeps offsets that have
  # outgrown the five-digit zero padding in numeric order.
  segments = Dir.entries(CHARACTER_TABLE_DIRECTORY)
                .select { |f| f =~ /^chars_/ }
                .sort_by { |f| [f[/\d+/].to_i, f] }

  merged = segments.map { |f| File.read(File.join(CHARACTER_TABLE_DIRECTORY, f)) }.join("\n")

  # NOTE(review): '\n' is single-quoted, so this removes the literal two
  # characters backslash + n, not a newline. Kept as-is to preserve output;
  # confirm whether that was the intent.
  entries = merged.split(/\n|\r/)
                  .map { |l| l.strip.gsub('\n', '') }
                  .reject { |line| line.empty? }
                  .uniq

  # Write the merged table before removing the partials so a failed write
  # cannot lose the scraped data; the block form closes the handle on error.
  File.open(final_charset_filename, "w+") { |out| out.puts entries.join("\n") }
  delete_partials unless KEEP_PARTIAL_FILES

  # The number reported is the count of lines in the merged table.
  log "Final character map contains:" + entries.size.to_s + " files"
end

#collect_characters(character_list) ⇒ Object



52
53
54
55
56
57
58
# File 'lib/scraper.rb', line 52

# Filters one batch of rows down to character entries and advances the cursor.
#
# Lines that are blank, or that read "More" followed by one or more dots
# (case-insensitive), are discarded. The survivors are stored in @charset,
# @next grows by their count, and their newline-joined text goes to @content.
#
# @param character_list [Array<String>] raw lines of one batch
# @return [String] the kept lines joined with "\n"
def collect_characters(character_list)
  noise = /^More\.+$|^\s*$/i
  @charset = character_list.select { |line| line !~ noise }
  log "character list: #{@charset.size} - #{@charset[0]}"
  @next = @next + @charset.size
  log "new range is #{@start} - #{@next}"
  @content = @charset.join("\n")
end

#delete_partials ⇒ Object



95
96
97
98
# File 'lib/scraper.rb', line 95

# Removes every partial file reported by #partials.
#
# @return [Array<String>] the paths that were deleted
def delete_partials
  log "Deleting partials"
  partials.each do |path|
    File.unlink(path)
  end
end

#doc ⇒ Object



43
44
45
46
# File 'lib/scraper.rb', line 43

# Downloads the page for the current offset and parses it.
#
# The page is opened through the URI object rather than Kernel#open: on
# Ruby 3.0+ Kernel#open no longer fetches URLs, and on any version it would
# run a command for a string beginning with "|". The block form closes the
# downloaded stream once parsing is done.
#
# @return [Object] the document produced by Nokogiri::HTML for #url
def doc
  log "reading doc #{url}"
  URI.parse(url).open { |page| Nokogiri::HTML(page) }
end

#log(message) ⇒ Object



25
26
27
28
# File 'lib/scraper.rb', line 25

# Prints a message to standard output and appends it, timestamped, to LOG.
#
# The log line is written directly to the file instead of through a shell
# command, so quotes or shell metacharacters in the message (which may carry
# scraped page content) cannot break or inject into a command line.
# Logging stays best-effort: if LOG cannot be written, a warning is printed
# and execution continues.
#
# @param message [#to_s] text to report
# @return [nil]
def log(message)
  puts message
  begin
    File.open(LOG, "a") { |file| file.puts "[#{Time.now}] #{message}" }
  rescue SystemCallError => e
    warn "could not write to #{LOG}: #{e.message}"
  end
  nil
end

#partial_filename(start = nil, last = nil) ⇒ Object



60
61
62
63
64
# File 'lib/scraper.rb', line 60

# Builds the path of the partial file covering the range start..last.
#
# Both offsets are zero-padded to five digits: chars_<start>-<last>.txt inside
# CHARACTER_TABLE_DIRECTORY.
#
# @param start [Integer, String, nil] first offset; defaults to @start
# @param last [Integer, String, nil] end offset; defaults to @last when it is
#   set, otherwise to @next
# @return [String] path of the partial file
def partial_filename(start = nil, last = nil)
  start ||= @start
  # NOTE(review): no method shown for this class assigns @last, so on its own
  # the default would always render as 00000. Fall back to @next, the cursor
  # that #record_characters passes explicitly.
  last ||= @last || @next
  File.join(CHARACTER_TABLE_DIRECTORY, format("chars_%05i-%05i.txt", start.to_i, last.to_i))
end

#partials ⇒ Object



91
92
93
# File 'lib/scraper.rb', line 91

# Lists the partial files, i.e. directory entries whose name starts with "chars_".
#
# @return [Array<String>] matching entries, each joined onto CHARACTER_TABLE_DIRECTORY
def partials
  names = Dir.entries(CHARACTER_TABLE_DIRECTORY).grep(/^chars_/)
  names.map { |name| File.join(CHARACTER_TABLE_DIRECTORY, name) }
end

#record_characters ⇒ Object



66
67
68
69
# File 'lib/scraper.rb', line 66

# Writes the current batch (@content) to the partial file for @start..@next.
#
# The block form of File.open guarantees the handle is closed even when the
# write raises.
#
# @return [nil]
def record_characters
  log "Creating new file"
  File.open(partial_filename(@start, @next), "w+") { |file| file.puts @content }
end

#retrieve ⇒ Object



122
123
124
125
# File 'lib/scraper.rb', line 122

# Runs the whole pipeline: collects every batch with #scrape, then merges the
# recorded partial files with #cleanup.
#
# @return [Object] whatever #cleanup returns
def retrieve
  # Order matters: #cleanup merges the files that #scrape records.
  scrape
  cleanup
end

#scrape ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/scraper.rb', line 76

# Collects batches of characters, starting at @start, until a batch is empty.
#
# For each batch: when a partial file for the current offset already exists
# its lines are reused; otherwise the page is fetched and a new partial file
# is recorded. @start is then moved up to @next. The loop always runs at
# least once and ends after the first batch that yields no characters, so it
# works for a non-zero starting offset (where @charset is still nil) and
# cannot spin forever when the very first batch is empty.
#
# @output_file is closed when the method exits, including on error.
#
# @return [nil]
def scrape
  log "Starting scrape"
  loop do
    # Look the partial up once per batch; each lookup rescans the directory.
    recorded = already_recorded
    if recorded
      log "Already recorded (File exists: #{recorded})"
      collect_characters File.read(recorded).split("\n")
    else
      collect_characters character_table
      record_characters
    end
    @start = @next
    break if @charset.empty?
  end
ensure
  @output_file.close unless @output_file.closed?
end

#supposed_leap(start = nil) ⇒ Object



38
39
40
41
# File 'lib/scraper.rb', line 38

# Counts the lines of the file associated with an offset.
#
# NOTE(review): #associated_file is not shown on this page; presumably it maps
# an offset to the partial file recorded for it. Confirm against lib/scraper.rb.
#
# @param start [Integer, nil] offset to look up; defaults to @start
# @return [Integer] number of lines in the associated file
def supposed_leap(start = nil)
  origin = start || @start
  recorded_lines = File.readlines(associated_file(origin))
  recorded_lines.length
end

#url ⇒ Object



30
31
32
# File 'lib/scraper.rb', line 30

# Builds the address of the listing page for the current offset.
#
# @return [String] SOURCE with its %s placeholder replaced by @start as an integer
def url
  offset = @start.to_i.to_s
  format(SOURCE, offset)
end