Class: Wingalingding::Scraper
- Inherits:
-
Object
- Object
- Wingalingding::Scraper
- Defined in:
- lib/scraper.rb
Constant Summary collapse
- SOURCE =
"http://www.fileformat.info/info/charset/UTF-8/list.htm?start=%s"
- CSS_SELECTOR =
"table.list tr"
- CHARACTER_TABLE_DIRECTORY =
File.join( File.dirname(__FILE__), "../charsets")
- COMPLETE_CHARACTER_TABLE =
"utf8.txt"
- OUTPUT_FILENAME =
File.join(CHARACTER_TABLE_DIRECTORY, "chars.txt")
- LOG =
File.join( File.dirname(__FILE__), "../log/scrape.log")
- KEEP_PARTIAL_FILES =
true
Class Method Summary collapse
- .character_map ⇒ Object
Instance Method Summary collapse
- #already_recorded ⇒ Object
- #character_files ⇒ Object
- #character_table ⇒ Object
- #cleanup ⇒ Object
- #collect_characters(character_list) ⇒ Object
- #delete_partials ⇒ Object
- #doc ⇒ Object
-
#initialize(start = 0) ⇒ Scraper
constructor
A new instance of Scraper.
- #log(message) ⇒ Object
- #partial_filename(start = nil, last = nil) ⇒ Object
- #partials ⇒ Object
- #record_characters ⇒ Object
- #retrieve ⇒ Object
- #scrape ⇒ Object
- #supposed_leap(start = nil) ⇒ Object
- #url ⇒ Object
Constructor Details
#initialize(start = 0) ⇒ Scraper
Returns a new instance of Scraper.
18 19 20 21 22 23 |
# File 'lib/scraper.rb', line 18
# Builds a scraper positioned at +start+.
#
# Ensures CHARACTER_TABLE_DIRECTORY exists and opens OUTPUT_FILENAME in
# "w+" mode (which truncates any existing file). The handle is kept in
# @output_file; #scrape is the method that closes it.
#
# @param start [Integer] offset used for both @start and @next
def initialize(start = 0)
  @start = @next = start
  FileUtils.mkdir_p(CHARACTER_TABLE_DIRECTORY)
  @output_file = File.new(OUTPUT_FILENAME, "w+")
end
Class Method Details
.character_map ⇒ Object
100 101 102 |
# File 'lib/scraper.rb', line 100
# Reads the complete character table file from the charset directory.
#
# @return [Array<String>] the lines of COMPLETE_CHARACTER_TABLE, each with
#   its line terminator intact (as returned by File.readlines)
def self.character_map
  table_path = File.join(CHARACTER_TABLE_DIRECTORY, COMPLETE_CHARACTER_TABLE)
  File.readlines(table_path)
end
Instance Method Details
#already_recorded ⇒ Object
71 72 73 |
# File 'lib/scraper.rb', line 71
# Looks for a partial file whose name starts at the current @start offset.
#
# @return [String, nil] the first entry of #character_files matching
#   "chars_<zero-padded @start>-<digits>.txt", or nil when there is none
def already_recorded
  pattern = /chars_#{"%05i" % @start.to_i}\-[0-9]+\.txt$/
  character_files.find { |candidate| candidate =~ pattern }
end
#character_files ⇒ Object
34 35 36 |
# File 'lib/scraper.rb', line 34
# Lists files in CHARACTER_TABLE_DIRECTORY whose names begin with "chars",
# contain at least one digit and end in ".txt".
#
# @return [Array<String>] matching entries joined onto CHARACTER_TABLE_DIRECTORY
def character_files
  names = Dir.entries(CHARACTER_TABLE_DIRECTORY)
  numbered = names.select { |name| name =~ /^chars.*[0-9]+.*\.txt$/ }
  numbered.map { |name| File.join(CHARACTER_TABLE_DIRECTORY, name) }
end
#character_table ⇒ Object
48 49 50 |
# File 'lib/scraper.rb', line 48
# Turns the rows selected by CSS_SELECTOR in the fetched document into
# tab-separated strings.
#
# @return [Array<String>] one string per selected node, built from the
#   content of its "td" children joined with tabs
def character_table
  rows = doc.css(CSS_SELECTOR)
  rows.map do |row|
    cells = row.css("td").map { |cell| cell.content }
    cells.join("\t")
  end
end
#cleanup ⇒ Object
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 |
# File 'lib/scraper.rb', line 105
# Merges every partial ("chars_*") file into the complete character table.
#
# Segments are read in sorted filename order so the result does not depend
# on the directory listing order. Each line is stripped, blank lines are
# dropped and duplicates removed before the table is written in a single
# pass. Partials are deleted (unless KEEP_PARTIAL_FILES) only after the
# merged file has been written, so a failed write cannot lose data.
#
# @return [Object] whatever #log returns for the final message
def cleanup
  log "Cleaning up files"
  final_charset_filename = File.join(CHARACTER_TABLE_DIRECTORY, COMPLETE_CHARACTER_TABLE)

  raw_lines = partials.sort.flat_map { |segment| File.read(segment).split(/\n|\r/) }
  # NOTE(review): '\n' is single-quoted, so this removes the literal two
  # characters backslash + n from the data, not newlines. Kept as in the
  # original; confirm whether that is intended.
  entries = raw_lines.map { |line| line.strip.gsub('\n', '') }
                     .reject(&:empty?)
                     .uniq

  # Block form guarantees the handle is closed even if the write raises.
  File.open(final_charset_filename, "w") { |file| file.puts entries.join("\n") }

  delete_partials unless KEEP_PARTIAL_FILES
  log "Final character map contains:#{entries.size} lines"
end
#collect_characters(character_list) ⇒ Object
52 53 54 55 56 57 58 |
# File 'lib/scraper.rb', line 52
# Stores the usable lines of +character_list+ and advances the range end.
#
# Lines that are blank or consist of "More" followed by dots
# (case-insensitive) are discarded. @next grows by the number of lines kept
# and the kept lines are joined into @content.
#
# @param character_list [Array<String>] candidate lines
# @return [String] the newline-joined lines, also stored in @content
def collect_characters(character_list)
  skip_pattern = /^More\.+$|^\s*$/i
  @charset = character_list.reject { |entry| entry =~ skip_pattern }
  log "character list: #{@charset.size} - #{@charset[0]}"

  @next = @next + @charset.size
  log "new range is #{@start} - #{@next}"

  @content = @charset.join("\n")
end
#delete_partials ⇒ Object
95 96 97 98 |
# File 'lib/scraper.rb', line 95
# Logs the operation, then removes every file returned by #partials.
#
# @return [Array<String>] the list of paths that were deleted
def delete_partials
  log "Deleting partials"
  partials.each do |path|
    File.unlink(path)
  end
end
#doc ⇒ Object
43 44 45 46 |
# File 'lib/scraper.rb', line 43
# Fetches the page at #url and parses it with Nokogiri.
#
# Kernel#open stopped opening URLs in Ruby 3.0, so URI.open (provided by
# open-uri since Ruby 2.5) is used when it exists; older runtimes fall back
# to the original Kernel#open call. The block form closes the downloaded
# stream once Nokogiri has parsed it.
#
# NOTE(review): assumes open-uri is already required by the file, as the
# original open(url) call needed it too — confirm.
#
# @return [Nokogiri::HTML::Document] the parsed page
def doc
  target = url
  log "reading doc #{target}"
  parse = ->(io) { Nokogiri::HTML(io) }
  if URI.respond_to?(:open)
    URI.open(target, &parse)
  else
    open(target, &parse)
  end
end
#log(message) ⇒ Object
25 26 27 28 |
# File 'lib/scraper.rb', line 25
# Appends a timestamped line to the LOG file.
#
# The message is written directly with File I/O rather than through a
# shell `echo`, so quotes, backticks and `$` in the message are stored
# verbatim and cannot break or inject into a shell command. The log
# directory is created on demand.
#
# @param message [#to_s] text to record
# @return [nil]
def log(message)
  FileUtils.mkdir_p(File.dirname(LOG))
  File.open(LOG, "a") { |file| file.puts "[#{Time.now}] #{message}" }
  nil
end
#partial_filename(start = nil, last = nil) ⇒ Object
60 61 62 63 64 |
# File 'lib/scraper.rb', line 60
# Builds the path of the partial file covering the range start..last.
#
# Both bounds are zero-padded to five digits, e.g. "chars_00128-00256.txt".
# When +last+ is omitted it falls back to @last and then to @next; no method
# visible here assigns @last, so without the @next fallback the default end
# of range would always format as 00000.
#
# @param start [Integer, nil] range start; defaults to @start
# @param last [Integer, nil] range end; defaults to @last, then @next
# @return [String] path inside CHARACTER_TABLE_DIRECTORY
def partial_filename(start = nil, last = nil)
  start ||= @start
  last ||= @last || @next
  name = format("chars_%05i-%05i.txt", start.to_i, last.to_i)
  File.join(CHARACTER_TABLE_DIRECTORY, name)
end
#partials ⇒ Object
91 92 93 |
# File 'lib/scraper.rb', line 91
# Lists the entries of CHARACTER_TABLE_DIRECTORY whose names start with
# "chars_".
#
# @return [Array<String>] matching entries joined onto CHARACTER_TABLE_DIRECTORY
def partials
  segment_names = Dir.entries(CHARACTER_TABLE_DIRECTORY).grep(/^chars_/)
  segment_names.map do |name|
    File.join(CHARACTER_TABLE_DIRECTORY, name)
  end
end
#record_characters ⇒ Object
66 67 68 69 |
# File 'lib/scraper.rb', line 66
# Writes @content to the partial file for the range @start..@next.
#
# The block form of File.open closes the handle even when the write raises,
# which the previous open/puts/close sequence did not.
#
# @return [nil]
def record_characters
  log "Creating new file"
  File.open(partial_filename(@start, @next), "w") { |file| file.puts @content }
  nil
end
#retrieve ⇒ Object
122 123 124 125 |
# File 'lib/scraper.rb', line 122
# Runs the whole pipeline: #scrape first, then #cleanup.
#
# @return [Object] the return value of #cleanup
def retrieve
  scrape
  cleanup
end
#scrape ⇒ Object
76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
# File 'lib/scraper.rb', line 76
# Collects character pages starting at @start until a page yields no rows.
#
# For each offset, an existing partial file (see #already_recorded) is
# reused; otherwise the page is fetched via #character_table and stored
# with #record_characters. The loop always runs once and stops after the
# first empty page. This means a scraper built with a non-zero start no
# longer calls #size on an unset @charset, and an empty first page at
# offset 0 no longer loops forever.
#
# @return [nil]
def scrape
  log "Starting scrape"
  loop do
    # Look the file up once per pass; each lookup lists the directory.
    recorded = already_recorded
    if recorded
      log "Already recorded (File exists: #{recorded})"
      collect_characters File.read(recorded).split("\n")
    else
      collect_characters character_table
      record_characters
    end
    @start = @next
    break if @charset.empty?
  end
  nil
ensure
  # Release the handle opened by the constructor even when a page fails.
  @output_file.close if @output_file && !@output_file.closed?
end
#supposed_leap(start = nil) ⇒ Object
38 39 40 41 |
# File 'lib/scraper.rb', line 38
# Counts the lines of the file that #associated_file returns for +start+.
#
# NOTE(review): #associated_file is not among the methods shown on this
# page; presumably it is defined elsewhere in the class — confirm.
#
# @param start [Integer, nil] offset to look up; defaults to @start
# @return [Integer] number of lines in that file
def supposed_leap(start = nil)
  origin = start || @start
  path = associated_file(origin)
  File.readlines(path).size
end
#url ⇒ Object
30 31 32 |
# File 'lib/scraper.rb', line 30
# Builds the page URL for the current offset by substituting the integer
# value of @start into the SOURCE template.
#
# @return [String] the URL to fetch
def url
  offset = @start.to_i.to_s
  format(SOURCE, offset)
end