Class: DomainsScanner::Crawlers::Baidu

Inherits:
Base
  • Object
show all
Defined in:
lib/domains_scanner/crawlers/baidu.rb

Instance Method Summary collapse

Methods inherited from Base

#agent, #have_next_page?, #parse_next_page_link, #search_by_form, #search_by_link, #search_keyword

Instance Method Details

#host ⇒ Object



4
5
6
# File 'lib/domains_scanner/crawlers/baidu.rb', line 4

# Base URL of the Baidu search site.
#
# @return [String] the scheme and host, without a trailing slash
def host
  'https://www.baidu.com'
end

#keyword_field_name ⇒ Object



8
9
10
# File 'lib/domains_scanner/crawlers/baidu.rb', line 8

# Name of the field that carries the search keyword.
#
# @return [String] the field name
def keyword_field_name
  'wd'
end


#next_page_link_selector ⇒ Object



32
33
34
# File 'lib/domains_scanner/crawlers/baidu.rb', line 32

# CSS selector for the link to the next page of results: the anchor that
# immediately follows the <strong> element inside the "#page" element.
#
# @return [String] the CSS selector
def next_page_link_selector
  '#page strong+a'
end

#parse_results(doc) ⇒ Object

Returns an array of hashes of the form [{ title: "xxx", url: "xxx" }, …].


13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# File 'lib/domains_scanner/crawlers/baidu.rb', line 13

# Extracts the search hits from a parsed result page.
#
# Each ".result" element yields one hash. Results whose display URL is
# missing or blank are skipped, because no usable URL can be built from them.
#
# @param doc [#search] parsed page; `search(".result")` must return an
#   enumerable of nodes that themselves respond to `search(selector).text`
# @return [Array<Hash>] hashes of the form `{ title: "xxx", url: "xxx" }`
def parse_results(doc)
  results = doc.search(".result").map do |item|
    # Baidu encrypts the target URL, so only the displayed URL is available,
    # e.g. "bbs.abc.net/for...php?...". That is still enough to recover the host.
    shown = item.search("div:last-child > a.c-showurl").text.strip
    # `search` returns a (possibly empty) node set, never nil, so the text
    # itself has to be checked to detect a missing display URL.
    next if shown.empty?

    url = shown.start_with?("http") ? shown : "http://#{shown}"

    # URI.encode was removed in Ruby 3.0; the parser's #escape is the
    # equivalent call that still exists.
    { title: item.search("h3.t > a").text, url: URI::DEFAULT_PARSER.escape(url) }
  end

  results.compact
end