# File 'lib/article_crux.rb', line 9
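# Fetch a URL and return a hash of :image, :title, and :tags extracted
# from Open Graph metadata, with page-level fallbacks. Uses HTTParty,
# Nokogiri, FastImage, and Addressable (presumably required at the top
# of this file), plus open-uri's Kernel#open as a fetch fallback.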
def self.fetch(url, user_agent="ArticleCrux(https://github.com/amitsaxena/article_crux)")
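# Prepend an http:// scheme when the URL arrives without one.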
url = (url =~ /^(http|https):\/\/(.)*/i) ? url : "http://#{url}"
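# Probe with a HEAD request first; if the URL points directly at an
# image, short-circuit and return it as the image.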
probe = HTTParty.head(url, headers: {"User-Agent" => user_agent})
if(probe.content_type && probe.content_type.split('/')[0] == "image")
return {:image => url, :title => nil, :tags => []}
end
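# Fetch the page with HTTParty; on any failure, fall back to open-uri.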
begin
res = HTTParty.get(url, headers: {"User-Agent" => user_agent})
raise "Unable to crawl URL" if res.code != 200
doc = Nokogiri::HTML(res)
rescue
doc = Nokogiri::HTML(open(url, "User-Agent" => user_agent))
end
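# If the page declares a canonical og:url that differs from the
# requested URL, re-fetch the canonical page (ignoring any failure).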
og_url = doc.search("//meta[@property='og:url' or @name='og:url']")
if (!og_url.empty? && !str_blank?(og_url[0]["content"]) && (og_url[0]["content"] =~ /^(http|https):\/\/(.)*/i) && (url != og_url[0]["content"]))
begin
res = HTTParty.get(og_url[0]["content"], headers: {"User-Agent" => user_agent})
raise "Unable to crawl URL" if res.code != 200
doc = Nokogiri::HTML(res)
rescue
end
end
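# Collect every og:image, resolving protocol-relative (//...) and
# root-relative (/...) paths against the requested URL.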
og_image = doc.search("//meta[@property='og:image' or @name='og:image']")
og_images = []
if !og_image.empty?
og_image.each do |ogi|
if !str_blank?(ogi["content"])
image = ogi["content"]
if (image =~ /^\/\/(.)*/)
uri = URI.parse(url)
image = "#{uri.scheme}:#{image}"
elsif (image =~ /^\/(.)*/)
uri = URI.parse(url)
image = File.join("#{uri.scheme}://#{uri.host}", image)
end
og_images << image
end
end
end
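# Pick the best Open Graph image; if none survives, or both its
# dimensions are under 100px, fall back to scanning the page's <img>
# tags for candidate paths, resolving relative src values against
# <base href> or the page URL.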
image = get_best_image(og_images)
og_size = FastImage.size(Addressable::URI.escape(image)) if !str_blank?(image)
if (str_blank?(image) || og_size.nil? || (og_size[0] < 100 && og_size[1] < 100))
image = nil
image_paths = []
page_images = doc.search("//img")
page_images.each do |page_image|
next if (str_blank?(page_image["src"]))
clip_image = page_image["src"]
if (clip_image && !(clip_image =~ /^(http|https):\/\/(.)*/i))
base = doc.search("//base")[0]
base_url = base["href"] if (!base.nil? && !str_blank?(base["href"]))
uri = URI.parse(url)
if (clip_image =~ /^\/(.)*/)
base_url = "#{uri.scheme}://#{uri.host}" if str_blank?(base_url)
else
base_url = "#{uri.scheme}://#{uri.host}#{uri.path[%r{^(.*[\/])}]}" if str_blank?(base_url)
end
clip_image = File.join(base_url, clip_image)
end
image_paths << clip_image
end
end
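# Title: prefer og:title, falling back to the <title> element.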
og_title = doc.search("//meta[@property='og:title' or @name='og:title']")
if (!og_title.empty? && !str_blank?(og_title[0]["content"]))
clip_title = og_title[0]["content"]
else
page_title = doc.search("//title")[0]
clip_title = page_title.text if !page_title.nil?
end
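# Tags: gather comma-separated values from any meta tag whose name or
# property mentions "tag" or "keyword", then flatten and de-duplicate.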
tags = []
possible_tags = doc.xpath('//meta[contains(@name, "tag") or contains(@name, "keyword") or contains(@property, "tag") or contains(@property, "keyword")]')
possible_tags.each{|e| tags << e["content"].split(',') if !str_blank?(e["content"])}
tags = tags.flatten.map(&:strip).uniq
res = {:image => image, :title => clip_title, :tags => tags}
end
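A minimal usage sketch, assuming the method lives on an ArticleCrux module as the file path suggests, and that the gem and its runtime dependencies (httparty, nokogiri, fastimage, addressable) are installed; the URL and return values are illustrative:

require 'article_crux'

result = ArticleCrux.fetch("www.example.com/some-article")
result[:image]  #=> URL string of the best image found, or nil
result[:title]  #=> og:title or <title> text, or nil
result[:tags]   #=> e.g. ["ruby", "scraping"]

# An alternate User-Agent can be passed as the second argument:
ArticleCrux.fetch("https://example.com", "MyCrawler/1.0")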