Class: Twexicon::Scraper
- Inherits:
-
Object
- Object
- Twexicon::Scraper
- Defined in:
- lib/twexicon/scraper.rb
Instance Attribute Summary collapse
-
#tweets ⇒ Object
readonly
Returns the value of attribute tweets.
Instance Method Summary collapse
-
#initialize(username) ⇒ Scraper
constructor
A new instance of Scraper.
- #refine_tweets ⇒ Object
- #scrape_tweets(username) ⇒ Object
Constructor Details
#initialize(username) ⇒ Scraper
Returns a new instance of Scraper.
4 5 6 7 |
# File 'lib/twexicon/scraper.rb', line 4

# Builds a scraper for one account.
#
# Starts with an empty tweet store and fills it right away by calling
# #scrape_tweets. #refine_tweets is not invoked here; callers run it
# themselves when they want the token buckets populated.
#
# @param username [String] account name, passed straight to #scrape_tweets
# @return [Scraper] a new instance of Scraper
def initialize(username)
  @tweets = {}
  scrape_tweets(username)
end
Instance Attribute Details
#tweets ⇒ Object (readonly)
Returns the value of attribute tweets.
2 3 4 |
# File 'lib/twexicon/scraper.rb', line 2

# Read-only accessor for the tweet store.
#
# @return [Object] the value of the @tweets instance variable (the very
#   same object, not a copy)
def tweets
  @tweets
end
Instance Method Details
#refine_tweets ⇒ Object
13 14 15 16 17 18 19 20 21 22 23 24 25 26 |
# File 'lib/twexicon/scraper.rb', line 13

# Ordered extraction rules for #refine_tweets: [bucket, pattern, cleanup].
#
# Order matters: every rule blanks out the text it consumed, so later (more
# general) rules never see it again -- e.g. links are removed before the
# number and word rules run. +cleanup+ is optional; without it the matched
# text is stored unchanged.
TOKEN_RULES = [
  # Dots are escaped so only a literal "pic.twitter.com" qualifies.
  [:pix,       %r{pic\.twitter\.com/\w{10}}],
  [:links,     %r{https?://[\w.?=&\-/\#]+}],
  [:hashtags,  /#\w+/],
  [:usernames, /@\w+/],
  # Digits with an optional ":" or "." separator (12:30, 3.14). A separator
  # left dangling at the end ("2016.") is trimmed from the stored value.
  [:numbers,   /\d+[:.]?\d*/, ->(match) { match.sub(/\W+\z/, "") }],
  # Dotted acronyms of two or three letters (U.S. / U.S.A.).
  [:acronyms,  /\b(?:[A-Z]\.){2,3}/],
  # Runs of upper-case words. Every word must be followed by a non-word
  # character, so a final word at the very end of the text is not included.
  [:shouts,    /(?:[A-Z]+\W){2,}|[A-Z]{4,}\W/, ->(match) { match.gsub(/\W/, " ").strip }],
  [:acronyms,  /\b[A-Z]{2,3}\b/],
  [:words,     %r{\w+['/]?\w*}]
].freeze

# Sorts the text of every stored tweet into its token buckets.
#
# Each entry of #tweets is expected to be a one-pair hash
# <tt>{ text => buckets }</tt>, which is the shape #scrape_tweets builds.
# The work is done on a copy of the text; the hash key itself is left
# untouched. Matches are appended to the existing bucket arrays, so calling
# this method twice stores every token twice.
#
# @return [Object] the #tweets collection
def refine_tweets
  tweets.each_value do |tweet|
    text, buckets = tweet.first
    next if text.nil? # empty entry: nothing to refine

    remaining = text.dup
    TOKEN_RULES.each do |bucket, pattern, cleanup|
      # One pass per rule: record the match, then blank it out.
      remaining.gsub!(pattern) do |match|
        buckets[bucket] << (cleanup ? cleanup.call(match) : match)
        " "
      end
    end
  end
end
#scrape_tweets(username) ⇒ Object
9 10 11 |
# File 'lib/twexicon/scraper.rb', line 9

# Downloads the profile page for +username+ and stores one entry per
# element matching the ".tweet-text" CSS selector.
#
# Entries are added to #tweets under consecutive integer keys that continue
# from the current size of the collection. Each entry is a one-pair hash
# mapping the element's text to a set of empty token buckets.
#
# The page is fetched with URI#open (supplied by open-uri, which the
# original Kernel#open call already depended on). Kernel#open no longer
# accepts URLs on Ruby 3.0+, and the block form closes the downloaded IO
# once the document has been parsed.
#
# @param username [String] account name, interpolated into the URL as-is
# @return [Object] whatever +each+ returns for the matched node collection
def scrape_tweets(username)
  profile_url = "https://twitter.com/#{username}"
  page = URI.parse(profile_url).open { |io| Nokogiri::HTML(io) }

  page.css(".tweet-text").each do |node|
    tweets[tweets.length + 1] = {
      node.text => {
        :pix => [], :links => [], :hashtags => [], :usernames => [],
        :numbers => [], :acronyms => [], :shouts => [], :words => []
      }
    }
  end
end