Class: KdnuggetsRoundup::DataWrassler

Inherits:
Object
  • Object
show all
Defined in:
lib/datawrassler.rb

Constant Summary collapse

BASE_URL =

web scraper class

'https://www.kdnuggets.com'
TOP_STORIES_PATH =
'/news/top-stories.html'

Instance Method Summary collapse

Instance Method Details

#wrassle_article_attributes(article_url) ⇒ Object



29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# File 'lib/datawrassler.rb', line 29

# Scrapes a single article page and collects the attributes that are assigned
# to an article. Helper method, called from inside wrassle_top_stories.
#
# @param article_url [String] URL of the article page to fetch
# @return [Hash] with the keys
#   :author  [String]        name pulled from the byline ('' when the byline
#                            does not match the expected "First Last," shape)
#   :tags    [Array<String>] text of every link inside div.tag-data
#   :summary [String]        text of p.excerpt
#   :excerpt [Array<String>] text of up to six p/ol/ul elements from the page
def wrassle_article_attributes(article_url)
  # URI.open (from open-uri) instead of Kernel#open: Kernel#open no longer
  # accepts URLs on Ruby 3.0+, and it would run a command for a string that
  # starts with "|".
  doc = Nokogiri::HTML(URI.open(article_url))

  tags = doc.css('div.tag-data a').map(&:text)
  summary = doc.css('p.excerpt').text

  # The byline is expected to contain two whitespace-separated words followed
  # by punctuation; String#[] returns nil (turned into '') when it does not,
  # so a page with an unusual byline no longer raises NoMethodError.
  byline = doc.css('#post- b').text
  author = byline[/\S*\s\S*[[:punct:]]/].to_s.gsub(/[0-9[[:punct:]]]/, '')

  # The first two matched elements are normally bylines or other fluff, so
  # they are skipped; the following six elements (3rd through 8th) are kept.
  # Elements whose text is empty are not filtered out.
  excerpt = doc.css('p, ol, ul').drop(2).first(6).map(&:text)

  { author: author, tags: tags, summary: summary, excerpt: excerpt }
end

#wrassle_top_stories ⇒ Object

> Note: each week there are 7 stories in the most popular list and 7 in the most shared list.



11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# File 'lib/datawrassler.rb', line 11

# Scrapes the top-stories page and files every listed story under either the
# "most popular" or the "most shared" list.
#
# Note there are 7 stories in both most popular and most shared each week, so
# the first 7 list items are treated as popular and the rest as shared. There
# are always 14 list items, but the same story often appears in both lists; a
# story whose title is already known is reused instead of being scraped again.
#
# @return [Object] the collection of list items that was iterated
def wrassle_top_stories
  # URI.open (from open-uri) instead of Kernel#open: Kernel#open no longer
  # accepts URLs on Ruby 3.0+.
  doc = Nokogiri::HTML(URI.open(BASE_URL + TOP_STORIES_PATH))

  doc.css('ol.three_ol li').each_with_index do |story, index|
    title = story.css('b').text

    # Look the title up once; only build (and scrape) an article when the
    # lookup comes back empty.
    article = KdnuggetsRoundup::Article.find_by_title(title)
    unless article
      url = BASE_URL + story.css('a').attribute('href').text
      article = KdnuggetsRoundup::Article.new(title, url)
      article.assign_attributes(wrassle_article_attributes(url))
    end

    # index is zero-based: items 0..6 are the most popular, the rest the most shared.
    index < 7 ? article.add_to_popular : article.add_to_shared
  end
end