Class: Collection::Taobao
- Inherits:
-
Object
- Object
- Collection::Taobao
- Defined in:
- lib/collection/taobao.rb
Class Method Summary collapse
Class Method Details
.down(urls) ⇒ Object
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# File 'lib/collection/taobao.rb', line 43
# Downloads every image URL into "#{Rails.root}/taobao", naming each file
# "<index><extension>", then stitches the downloaded files together.
#
# @param urls [Array<String>] image URLs; anything not starting with "http"
#   (e.g. protocol-relative "//host/img.jpg") is prefixed with "https:"
# @return [Object] whatever merge_image returns
def self.down(urls)
  path = "#{Rails.root}/taobao"
  # Created up front (not per URL) so merge_image always finds the directory,
  # even when urls is empty.
  FileUtils.mkdir_p(path)

  urls.each_with_index do |url, idx|
    url = "https:#{url}" unless url.start_with?('http')
    # URI.open instead of Kernel#open: the latter no longer fetches URLs on
    # Ruby 3.0+ and would treat the string as a local path.
    data = URI.open(url, 'User-Agent' => 'ruby', &:read)

    # Take the extension from the URL without its query/fragment, so
    # "pic.jpg?v=1.5" yields ".jpg" rather than "5". Only plain alphanumeric
    # extensions are kept: the URL comes from a scraped page and must not be
    # able to put odd characters into the file name. With no usable extension
    # the file is simply named by its index.
    ext = File.extname(url.split(/[?#]/, 2).first)[/\A\.[A-Za-z0-9]+\z/].to_s

    # Block form closes the handle even if the write raises; "wb" keeps the
    # image bytes untouched.
    File.open("#{path}/#{idx}#{ext}", 'wb') { |file| file << data }
  end

  # NOTE(review): files left in the directory by an earlier run with more
  # images are not removed here and will still be picked up by merge_image.
  merge_image
end
.get_text(urls) ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# File 'lib/collection/taobao.rb', line 27
# Runs each image URL through the Baidu "general_basic" OCR endpoint and
# returns all recognised lines of text.
#
# @param urls [Array<String>] image URLs to recognise
# @return [String] the recognised lines joined with "\r\n" ("" when nothing
#   was recognised or urls is empty)
# @raise [RuntimeError] when the token response carries no access_token
def self.get_text(urls)
  # Loaded locally so the method does not depend on another file having
  # required JSON already.
  require 'json'

  return '' if urls.empty?

  # NOTE(review): these credentials were committed to source. They remain as
  # defaults only for backward compatibility; rotate them and supply the real
  # values through the environment.
  client_id = ENV.fetch('BAIDU_OCR_CLIENT_ID', 'ob5etUcqOmPr7HA5co98yreC')
  client_secret = ENV.fetch('BAIDU_OCR_CLIENT_SECRET', 'puRIlGWeOtaLbbPr5zZfeyNgLEC88wyF')
  token_url = 'https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials' \
              "&client_id=#{client_id}&client_secret=#{client_secret}"

  # Responses are parsed as JSON, never eval'ed: the body comes from the
  # network, and JSON literals such as null are not valid Ruby anyway.
  token_result = JSON.parse(Struggle::Http.new(token_url).post.body, symbolize_names: true)
  token = token_result[:access_token]
  raise "Baidu OCR token request failed: #{token_result.inspect}" unless token

  ocr_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic?access_token=#{token}"
  form_headers = { 'Content-Type' => 'application/x-www-form-urlencoded' }

  words = urls.flat_map do |url|
    response = Struggle::Http.new(ocr_url).post({ url: url }, form_headers)
    result = JSON.parse(response.body, symbolize_names: true)
    # A missing or empty words_result contributes nothing.
    Array(result[:words_result]).map { |entry| entry[:words] }
  end

  words.join("\r\n")
end
.merge_image ⇒ Object
61 62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/collection/taobao.rb', line 61
# Stacks every file found in "#{Rails.root}/taobao" into a single
# timestamp-named JPEG under "#{Rails.root}/taobao_img" by running
# ImageMagick's `convert -append`.
#
# @return [String] the path the merged image is written to (returned even if
#   the convert command fails; a warning is printed in that case)
def self.merge_image
  source_dir = "#{Rails.root}/taobao"

  # Directory listing order is unspecified, and a plain string sort would put
  # "10.jpg" before "2.jpg"; order by the leading numeric index so the pieces
  # are appended in their original sequence.
  names = Dir.entries(source_dir).reject { |name| name == '.' || name == '..' }
  images = names.sort_by { |name| [name.to_i, name] }
                .map { |name| "#{source_dir}/#{name}" }

  output_dir = "#{Rails.root}/taobao_img"
  Dir.mkdir(output_dir) unless Dir.exist?(output_dir)
  save_img_path = "#{output_dir}/#{Time.now.strftime('%Y%m%d%H%M%S')}.jpg"

  # Argument-list form: no shell is involved, so spaces or metacharacters in
  # a path or file name cannot be interpreted as shell syntax.
  succeeded = system('convert', '-append', *images, save_img_path)
  warn "convert -append failed; #{save_img_path} was not written" unless succeeded

  save_img_path
end
.run(url) ⇒ Object
6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 |
# File 'lib/collection/taobao.rb', line 6
# Opens the page in Chrome, collects the src of every image inside the
# element with id "description", then downloads/merges those images and runs
# OCR on them.
#
# @param url [String] address of the page to scrape
# @return [Hash] { imgpath: <result of down>, text: <result of get_text> }
def self.run(url)
  # The commented-out section below configures the browser driver so the
  # browser can run hidden (headless). Scraping Taobao fails that way,
  # however: Taobao requires a visible browser.
  # filePath = File.expand_path(File.dirname(File.dirname(__FILE__)))
  # chromedriverPath = File.expand_path("collection/chromedriver", filePath)
  # Selenium::WebDriver::Chrome.driver_path = chromedriverPath
  # options = Selenium::WebDriver::Chrome::Options.new
  # options.add_argument("headless")
  # browser = Watir::Browser.new :chrome, options: options
  browser = Watir::Browser.new :chrome
  begin
    browser.goto url
    images = browser.div(id: 'description').imgs.map do |img|
      # Each image is scrolled into view and given two seconds before its src
      # is read (presumably so lazily loaded images get their real src —
      # TODO confirm).
      img.scroll_into_view
      sleep 2
      src = img.src
      puts src
      src
    end
  ensure
    # Always release the browser, even when navigation or scraping raises;
    # otherwise the Chrome window/process is left running.
    browser.close
  end

  { imgpath: down(images), text: get_text(images) }
end