Class: Collection::Taobao

Inherits:
Object
  • Object
show all
Defined in:
lib/collection/taobao.rb

Class Method Summary collapse

Class Method Details

.down(urls) ⇒ Object



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/collection/taobao.rb', line 43

# Downloads every image URL into "#{Rails.root}/taobao" and then stitches the
# downloaded files together via merge_image.
#
# Files are named "<index><extension>", where the extension is taken from the
# URL's path component (so a query string never ends up in the filename).
#
# @param urls [Array<String>] image URLs; protocol-relative URLs ("//host/...")
#   are fetched over https
# @return [Object] whatever merge_image returns
def self.down(urls)
  path = "#{Rails.root}/taobao"
  # Created up front (and idempotently) so merge_image always finds the directory.
  FileUtils.mkdir_p(path)

  urls.each_with_index do |url, idx|
    url = "https:#{url}" unless url.start_with?("http")
    # URI.open is the explicit open-uri entry point; Kernel#open stopped
    # accepting URLs in Ruby 3.0.
    data = URI.open(url, 'User-Agent' => 'ruby', &:read)
    extension = File.extname(URI.parse(url).path.to_s)
    # binwrite opens in binary mode and closes the handle even if the write fails.
    File.binwrite("#{path}/#{idx}#{extension}", data)
  end
  merge_image
end

.get_text(urls) ⇒ Object



27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/collection/taobao.rb', line 27

# Runs Baidu's general_basic OCR endpoint on each image URL and returns all
# recognised lines of text.
#
# @param urls [Array<String>] image URLs to send to the OCR endpoint
# @return [String] recognised lines joined with "\r\n" ("" when nothing was recognised)
def self.get_text(urls)
  require 'json' # no-op when JSON is already loaded

  # NOTE(review): the API client id/secret are committed in this URL; they
  # should be rotated and moved to configuration.
  token_request = Struggle::Http.new("https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=ob5etUcqOmPr7HA5co98yreC&client_secret=puRIlGWeOtaLbbPr5zZfeyNgLEC88wyF")
  # Response bodies are parsed as JSON rather than eval'd: eval would execute
  # whatever the remote side returns and also fails on JSON `null`.
  token = JSON.parse(token_request.post.body, symbolize_names: true)[:access_token]

  words = urls.flat_map do |url|
    ocr_request = Struggle::Http.new("https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic?access_token=" + token)
    response = ocr_request.post({url: url}, {"Content-Type" => "application/x-www-form-urlencoded"})
    result = JSON.parse(response.body, symbolize_names: true)
    # A response without :words_result contributes no lines.
    (result[:words_result] || []).map { |entry| entry[:words] }
  end
  words.join("\r\n")
end

.merge_image ⇒ Object



61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/collection/taobao.rb', line 61

# Stitches every file in "#{Rails.root}/taobao" into one JPEG with
# ImageMagick's `convert -append`, in ascending order of the numeric prefix of
# each filename.
#
# @return [String] path of the output image under "#{Rails.root}/taobao_img",
#   named after the current timestamp; returned even if `convert` fails
def self.merge_image
  source_dir = "#{Rails.root}/taobao"
  names = Dir.entries(source_dir).reject { |name| name == "." || name == ".." }
  # Directory listing order is unspecified; sort by the leading index so that
  # "10.jpg" comes after "2.jpg".
  images = names.sort_by { |name| [name.to_i, name] }.map { |name| "#{source_dir}/#{name}" }

  output_dir = "#{Rails.root}/taobao_img"
  Dir.mkdir(output_dir) unless Dir.exist?(output_dir)
  save_path = "#{output_dir}/#{Time.now.strftime('%Y%m%d%H%M%S')}.jpg"

  # Argument-list form bypasses the shell, so filenames containing spaces,
  # "?", "&" and similar characters are passed through verbatim.
  succeeded = system("convert", "-append", *images, save_path)
  warn "convert -append failed; #{save_path} may not have been written" unless succeeded
  save_path
end

.run(url) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
# File 'lib/collection/taobao.rb', line 6

# Opens the page in Chrome, collects the `src` of every image inside the
# element with id "description", then downloads/merges the images and runs OCR
# on them.
#
# @param url [String] page to open
# @return [Hash] {imgpath: <result of down>, text: <result of get_text>}
def self.run(url)
  # The commented-out section below configures the browser driver so that the
  # browser can run hidden (headless). Scraping Taobao does not work that way,
  # though: Taobao requires a visible browser.
  # filePath = File.expand_path(File.dirname(File.dirname(__FILE__)))
  # chromedriverPath = File.expand_path("collection/chromedriver", filePath)
  # Selenium::WebDriver::Chrome.driver_path = chromedriverPath
  # options = Selenium::WebDriver::Chrome::Options.new
  # options.add_argument("headless")
  # browser = Watir::Browser.new :chrome, options: options
  browser = Watir::Browser.new :chrome
  images = []
  begin
    browser.goto url
    browser.div(id: "description").imgs.each do |img|
      img.scroll_into_view
      # Pause after scrolling before reading src (presumably to let lazy-loaded
      # images resolve — confirm).
      sleep 2
      src = img.src
      puts src
      images << src
    end
  ensure
    # Always release the browser, even when navigation or scraping raises.
    browser.close
  end
  {imgpath: down(images), text: get_text(images)}
end