Class: NHKore::BingScraper
- Inherits: SearchScraper
  - Object
  - Scraper
  - SearchScraper
  - NHKore::BingScraper
- Defined in: lib/nhkore/search_scraper.rb
Constant Summary
Constants inherited from SearchScraper
SearchScraper::DEFAULT_RESULT_COUNT, SearchScraper::FUTSUU_REGEX, SearchScraper::FUTSUU_SITE, SearchScraper::IGNORE_LINK_REGEX, SearchScraper::YASASHII_REGEX, SearchScraper::YASASHII_SITE
Constants inherited from Scraper
Instance Attribute Summary
- #regex ⇒ Object (readonly)
  Returns the value of attribute regex.
- #site ⇒ Object (readonly)
  Returns the value of attribute site.
Attributes inherited from Scraper
#kargs, #max_redirects, #max_retries, #redirect_rule, #str_or_io, #url
Class Method Summary
- .build_url(site, count: DEFAULT_RESULT_COUNT, **_kargs) ⇒ Object
Instance Method Summary
- #initialize(site, regex: nil, url: nil, **kargs) ⇒ BingScraper (constructor)
  A new instance of BingScraper.
- #scrape(slinks, page = NextPage.new()) ⇒ Object
- #scrape_html(slinks, page, next_page = NextPage.new()) ⇒ Object
- #scrape_rss(slinks, page, next_page = NextPage.new()) ⇒ Object
Methods inherited from SearchScraper
#fetch_valid_link?, #ignore_link?
Methods inherited from Scraper
#fetch_cookie, #html_doc, #join_url, #open, #open_file, #open_url, #read, #reopen, #rss_doc
Constructor Details
#initialize(site, regex: nil, url: nil, **kargs) ⇒ BingScraper
Returns a new instance of BingScraper.
# File 'lib/nhkore/search_scraper.rb', line 90

def initialize(site,regex: nil,url: nil,**kargs)
  case site
  when :futsuu
    regex = FUTSUU_REGEX if regex.nil?
    site = FUTSUU_SITE
  when :yasashii
    regex = YASASHII_REGEX if regex.nil?
    site = YASASHII_SITE
  else
    raise ArgumentError,"invalid site[#{site}]"
  end

  raise ArgumentError,"empty regex[#{regex}]" if regex.nil?

  @regex = regex
  @site = site

  url = self.class.build_url(site,**kargs) if url.nil?

  # Delete class-specific args (don't pass to Open-URI).
  kargs.delete(:count)

  super(url,**kargs)
end
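For orientation, here is a minimal, hypothetical construction sketch based only on the signature above; the count: keyword is forwarded to .build_url and then deleted before the remaining arguments reach the parent Scraper:

require 'nhkore/search_scraper'

# Hypothetical usage: the :yasashii symbol picks the site, regex,
# and search URL; count: is consumed by .build_url below.
scraper = NHKore::BingScraper.new(:yasashii, count: 100)

scraper.site   # the resolved YASASHII_SITE value
scraper.regex  # the resolved YASASHII_REGEX value

# Passing an unknown symbol raises ArgumentError ("invalid site[...]").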
Instance Attribute Details
#regex ⇒ Object (readonly)
Returns the value of attribute regex.
# File 'lib/nhkore/search_scraper.rb', line 87

def regex
  @regex
end
#site ⇒ Object (readonly)
Returns the value of attribute site.
# File 'lib/nhkore/search_scraper.rb', line 88

def site
  @site
end
Class Method Details
.build_url(site, count: DEFAULT_RESULT_COUNT, **_kargs) ⇒ Object
# File 'lib/nhkore/search_scraper.rb', line 114

def self.build_url(site,count: DEFAULT_RESULT_COUNT,**_kargs)
  url = ''.dup

  url << 'https://www.bing.com/search?'
  url << URI.encode_www_form(
    q: "site:#{site}",
    count: count,
    qs: 'n',
    sp: '-1',
    lq: '0',
    pq: "site:#{site}",
    sc: '1-25',
    sk: '',
    first: '1',
    FORM: 'PERE',
  )

  return url
end
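As an illustration of the output (the site string below is a made-up example, not one of the SearchScraper site constants), URI.encode_www_form percent-encodes each pair, so the result looks roughly like:

NHKore::BingScraper.build_url('example.org/news/easy/', count: 50)
# => "https://www.bing.com/search?q=site%3Aexample.org%2Fnews%2Feasy%2F&count=50&qs=n&sp=-1&lq=0&pq=site%3Aexample.org%2Fnews%2Feasy%2F&sc=1-25&sk=&first=1&FORM=PERE"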
Instance Method Details
#scrape(slinks, page = NextPage.new()) ⇒ Object
# File 'lib/nhkore/search_scraper.rb', line 135

def scrape(slinks,page = NextPage.new())
  next_page,link_count = scrape_html(slinks,page)

  if link_count <= 0
    scrape_rss(slinks,page,next_page)
  end

  return next_page
end
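A hedged pagination sketch: scrape fills a collector and returns the NextPage found by scrape_html, falling back to scrape_rss when no HTML links were found. It assumes NextPage lives alongside this class in nhkore/search_scraper, that empty? means "no further page was detected" (as it is used in scrape_rss below), and that any object responding to add_link can serve as slinks; nhkore's own SearchLinks class would normally play that role.

require 'nhkore/search_scraper'

# Stand-in collector for illustration only.
class LinkBag
  attr_reader :links

  def initialize
    @links = []
  end

  def add_link(link)
    @links << link
  end
end

slinks = LinkBag.new
page = NHKore::NextPage.new
scraper = NHKore::BingScraper.new(:futsuu)

loop do
  next_page = scraper.scrape(slinks, page)
  break if next_page.empty?  # no further "first=" page was detected

  # Open a fresh scraper directly at the next results page.
  scraper = NHKore::BingScraper.new(:futsuu, url: next_page.url)
  page = next_page
end

puts slinks.links.length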
#scrape_html(slinks, page, next_page = NextPage.new()) ⇒ Object
# File 'lib/nhkore/search_scraper.rb', line 145

def scrape_html(slinks,page,next_page = NextPage.new())
  doc = html_doc
  link_count = 0

  anchors = doc.css('a')

  anchors.each do |anchor|
    href = anchor['href'].to_s
    href = Util.unspace_web_str(href).downcase

    next if ignore_link?(href)

    if (md = href.match(/first=(\d+)/i)) && href =~ /FORM=PERE/i
      count = md[1].to_i

      if count > page.count && (next_page.count < 0 || count < next_page.count)
        next_page.count = count
        next_page.url = join_url(href)
      end
    elsif href =~ regex && fetch_valid_link?(href)
      slinks.add_link(SearchLink.new(href))
      link_count += 1
    end
  end

  return [next_page,link_count]
end
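To make the next-page detection concrete: Bing's pager anchors carry a FORM=PERE marker plus a first=N result offset, and the method keeps the smallest offset that is still greater than the current page's count. The same extraction in isolation, with a made-up, already-downcased href:

href = '/search?q=site%3aexample.org&first=11&form=pere'  # hypothetical pager link

if (md = href.match(/first=(\d+)/i)) && href =~ /FORM=PERE/i
  md[1].to_i  # => 11, candidate count for next_page
end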
#scrape_rss(slinks, page, next_page = NextPage.new()) ⇒ Object
# File 'lib/nhkore/search_scraper.rb', line 173

def scrape_rss(slinks,page,next_page = NextPage.new())
  link_count = 0

  if !@is_file
    uri = URI(@url)

    Util.replace_uri_query!(uri,format: 'rss')
    self.open(uri)

    doc = rss_doc
    rss_links = []

    doc.items.each do |item|
      link = item.link.to_s
      link = Util.unspace_web_str(link).downcase

      rss_links << link

      next if ignore_link?(link)
      next if link !~ regex || !fetch_valid_link?(link)

      slinks.add_link(SearchLink.new(link))
      link_count += 1
    end

    # For RSS, Bing will keep returning the same links over and over
    # if it's the last page or the "first=" query is the wrong count.
    # Therefore, we have to test the previous RSS links (+page.rss_links+).
    if next_page.empty? && doc.items.length >= 1 && page.rss_links != rss_links
      next_page.count = (page.count < 0) ? 0 : page.count
      next_page.count += doc.items.length
      next_page.rss_links = rss_links

      uri = URI(page.url.nil? ? @url : page.url)

      Util.replace_uri_query!(uri,first: next_page.count)
      next_page.url = uri
    end
  end

  return [next_page,link_count]
end
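The RSS fallback switches the HTML search to Bing's RSS output by rewriting the query string (format=rss, and later first=N for paging). Util.replace_uri_query! is nhkore's own helper; as a rough standard-library stand-in for what the rewrite amounts to here (an assumption about its effect, not its actual implementation):

require 'uri'

# Hypothetical equivalent of the query rewrite: merge/overwrite parameters
# and re-encode the rest of the query string in place.
def replace_query!(uri, **params)
  query = URI.decode_www_form(uri.query.to_s).to_h
  params.each { |key, value| query[key.to_s] = value.to_s }
  uri.query = URI.encode_www_form(query)
  uri
end

uri = URI('https://www.bing.com/search?q=site%3Aexample.org&count=100')
replace_query!(uri, format: 'rss')
uri.to_s
# => "https://www.bing.com/search?q=site%3Aexample.org&count=100&format=rss"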