Class: Hongkong::News::Scrapers::MingpaoScraper

Inherits:
Object
  • Object
show all
Includes:
PhantomScraper
Defined in:
lib/hongkong/news/scrapers/mingpao_scraper.rb

Constant Summary collapse

LIST_URL =
"http://news.mingpao.com/pns/%E6%96%B0%E8%81%9E%E7%B8%BD%E8%A6%BD/web_tc/archive/latest"

Instance Method Summary collapse

Methods included from PhantomScraper

#html, #screenshot_data

Instance Method Details

#news(url) ⇒ Object

Extracts the article from a Mingpao news page.



25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# File 'lib/hongkong/news/scrapers/mingpao_scraper.rb', line 25

# Loads a Mingpao article page and packages what it finds into a Document.
#
# @param url address of the article page; handed to `visit` and stored
#   unchanged on the returned document
# @return [Document] document whose source is 'mingpao', carrying the page's
#   first <h1> text as title plus the html, inner text and screenshot data
def news(url)
  visit(url)

  # Look up the first article paragraph before reading anything, so the
  # content has a chance to be loaded.
  first("article p")

  Document.new.tap do |article|
    article.source = 'mingpao'

    heading = first("h1")
    article.title = heading.text
    article.url = url
    article.html = html

    # Inner text of the <article> element is computed by a script in the page.
    inner_text_script = "HongKongNews.getInnerText('article')"
    article.content = page.evaluate_script(inner_text_script)
    article.screenshot_data = screenshot_data
  end
end

#news_links ⇒ Object

Extracts all news links from the Mingpao listing page.



13
14
15
16
17
18
19
20
21
22
# File 'lib/hongkong/news/scrapers/mingpao_scraper.rb', line 13

# Visits the listing page at LIST_URL and builds one Link per anchor found
# under ".listing ul li a".
#
# @return collection of Link objects, each with the anchor's text as title
#   and its href resolved against LIST_URL as url
def news_links
  visit(LIST_URL)

  anchors = all(".listing ul li a")
  anchors.map do |node|
    Link.new.tap do |entry|
      entry.title = node.text
      # Resolve the href against the listing URL so the stored url is absolute.
      entry.url = URI.join(LIST_URL, node["href"]).to_s
    end
  end
end