Class: Collection::Taobao

Inherits:

Object

Object
Collection::Taobao

Defined in: lib/collection/taobao.rb

Class Method Summary collapse

Class Method Details

.down(urls) ⇒ `Object`

# File 'lib/collection/taobao.rb', line 43

# Downloads every image in +urls+ into "#{Rails.root}/taobao" and then merges
# the downloaded files through merge_image.
#
# Each image is stored as "<index><extension>", where index is the position of
# the URL in +urls+ and the extension comes from the URL's path component.
#
# @param urls [Array<String>] image URLs; entries that do not start with
#   "http" are treated as protocol-relative and prefixed with "https:"
# @return [Object] whatever merge_image returns
def self.down(urls)
  # NOTE(review): files left here by an earlier run are not removed, and
  # merge_image reads the whole directory — confirm whether it should be
  # emptied before downloading.
  dir = "#{Rails.root}/taobao"
  FileUtils.mkdir_p(dir)

  urls.each_with_index do |url, idx|
    full_url = url.start_with?("http") ? url : "https:#{url}"
    uri = URI.parse(full_url)

    # URI#open (from open-uri) is used instead of Kernel#open, which stopped
    # accepting URLs in Ruby 3.0.
    data = uri.open("User-Agent" => "ruby") { |f| f.read }

    # Take the extension from the path only, so query strings, host names and
    # characters that are unsafe in a file name never end up in it. A URL
    # without a plain alphanumeric extension is saved without one.
    ext = File.extname(uri.path.to_s)[/\A\.[A-Za-z0-9]+\z/].to_s

    # binwrite opens in binary mode and closes the handle even on failure.
    File.binwrite("#{dir}/#{idx}#{ext}", data)
  end
  merge_image
end

.get_text(urls) ⇒ `Object`

# File 'lib/collection/taobao.rb', line 27

# Runs Baidu's general_basic OCR endpoint over each image URL and returns the
# recognised text.
#
# An access token is requested once, then one OCR request is posted per URL.
# The "words" value of every "words_result" entry is collected in order.
#
# @param urls [Array<String>] image URLs passed to the OCR service
# @return [String] all recognised lines joined with "\r\n" ("" when none)
# @raise [RuntimeError] if the token response has no access_token
# @raise [JSON::ParserError] if a response body is not valid JSON
def self.get_text(urls)
  # Local require so this method does not depend on load order elsewhere.
  require "json"

  # NOTE(review): the client id and secret are hard-coded in this URL; they
  # should be moved to configuration and rotated.
  token_request = Struggle::Http.new("https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=ob5etUcqOmPr7HA5co98yreC&client_secret=puRIlGWeOtaLbbPr5zZfeyNgLEC88wyF")

  # Response bodies are parsed as JSON rather than eval'ed: eval would execute
  # whatever the remote side sends and also fails on JSON's `null`.
  token = JSON.parse(token_request.post.body, symbolize_names: true)[:access_token]
  raise "Baidu OAuth response did not contain an access_token" unless token

  ocr_endpoint = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic?access_token=#{token}"
  form_headers = {"Content-Type" => "application/x-www-form-urlencoded"}

  words = urls.flat_map do |url|
    response = Struggle::Http.new(ocr_endpoint).post({url: url}, form_headers)
    results = JSON.parse(response.body, symbolize_names: true)[:words_result]
    (results || []).map { |entry| entry[:words] }
  end
  words.join("\r\n")
end

.merge_image ⇒ `Object`

# File 'lib/collection/taobao.rb', line 61

# Stacks every file found in "#{Rails.root}/taobao" into a single JPEG by
# running the external `convert -append` command.
#
# The result is written to "#{Rails.root}/taobao_img/<YYYYmmddHHMMSS>.jpg".
#
# @return [String] the output path; it is returned even if the command fails
# @raise [Errno::ENOENT] if the source directory does not exist
def self.merge_image
  source_dir = "#{Rails.root}/taobao"

  # Directory listing order is unspecified, so sort by the numeric file name
  # (0, 1, 2, ... 10) to stack images in index order; the name itself breaks
  # ties between files that share a number.
  names = (Dir.entries(source_dir) - %w[. ..]).sort_by do |name|
    [File.basename(name, ".*").to_i, name]
  end
  images = names.map { |name| "#{source_dir}/#{name}" }

  output_dir = "#{Rails.root}/taobao_img"
  FileUtils.mkdir_p(output_dir)
  saveImgPath = "#{output_dir}/#{Time.now.strftime "%Y%m%d%H%M%S"}.jpg"

  # Pass arguments as a list so no shell is involved: file names containing
  # spaces or shell metacharacters cannot break or inject into the command.
  succeeded = system("convert", "-append", *images, saveImgPath)
  warn "convert failed to produce #{saveImgPath}" unless succeeded

  saveImgPath
end

.run(url) ⇒ `Object`

# File 'lib/collection/taobao.rb', line 6

# Opens +url+ in Chrome, collects the image URLs inside the element with id
# "description", then downloads/merges the images and runs OCR on them.
#
# @param url [String] page to open
# @return [Hash] {imgpath: <result of down>, text: <result of get_text>}
def self.run(url)
  # The commented-out section configures the browser driver so the browser can
  # run hidden (headless). Scraping Taobao fails in that mode, though; Taobao
  # requires a visible browser.
  # filePath = File.expand_path(File.dirname(File.dirname(__FILE__)))
  # chromedriverPath = File.expand_path("collection/chromedriver", filePath)
  # Selenium::WebDriver::Chrome.driver_path = chromedriverPath
  # options = Selenium::WebDriver::Chrome::Options.new
  # options.add_argument("headless")
  # browser = Watir::Browser.new :chrome, options: options
  browser = Watir::Browser.new :chrome
  begin
    browser.goto url
    images = []
    browser.div(id: "description").imgs.each do |img|
      img.scroll_into_view
      # Presumably gives a lazily loaded image time to load after scrolling —
      # confirm before changing the delay.
      sleep 2
      src = img.src
      puts src
      # An image without a src cannot be downloaded or OCR'ed; skip it.
      next if src.to_s.empty?
      images << src
    end
  ensure
    # Always close the browser, even when navigation or scraping raises.
    browser.close
  end
  {imgpath: down(images), text: get_text(images)}
end

Class: Collection::Taobao

Class Method Summary collapse

Class Method Details

.down(urls) ⇒ Object

.get_text(urls) ⇒ Object

.merge_image ⇒ Object

.run(url) ⇒ Object

.down(urls) ⇒ `Object`

.get_text(urls) ⇒ `Object`

.merge_image ⇒ `Object`

.run(url) ⇒ `Object`