Module: Textract
- Defined in:
- lib/textract.rb,
lib/textract/version.rb
Defined Under Namespace
Classes: Client
Constant Summary collapse
- TAG_WHITELIST =
attr_accessor :client
%w[ div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong figure ]
- VERSION =
"0.0.18"
Class Method Summary collapse
- .build_author(article, html) ⇒ Object
- .build_site(url, html) ⇒ Object
- .generate_hash(text) ⇒ Object
- .get_author(html) ⇒ Object
- .get_og_tags(html, url) ⇒ Object
- .get_page_title(html) ⇒ Object
- .get_text(url, selectors = nil, format = "markdown") ⇒ Object
- .get_twitter(html) ⇒ Object
- .smart_extract(html, description, selectors) ⇒ Object
Class Method Details
.build_author(article, html) ⇒ Object
107 108 109 110 111 112 |
# File 'lib/textract.rb', line 107
# Builds the author hash for an extracted article.
#
# NOTE(review): the method name and two identifiers were missing from this
# listing. `build_author` and `get_author` are restored from the method
# summary; `article.author` is an assumption -- confirm against the object
# the caller passes in.
#
# @param article [#author] extracted article; its author is preferred
# @param html [String] page markup used for the meta-tag lookups
# @return [Hash] hash with `:name` and `:twitter` keys
def self.build_author(article, html)
  {
    # Fall back to the author meta tag only when the article has no author.
    name: article.author || get_author(html),
    twitter: get_twitter(html),
  }
end
.build_site(url, html) ⇒ Object
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
# File 'lib/textract.rb', line 81
# Builds the site hash (display name and Twitter handle) for a page.
#
# The name comes from the `og:site_name` meta tag. When that tag is missing
# or has no `content`, the name is derived from the URL host with a leading
# "www." removed and the first letter capitalized.
#
# The Twitter handle comes from the `twitter:site` meta tag, read from its
# `content` attribute and then its `value` attribute. It is nil when the tag
# is absent or carries neither attribute.
#
# @param url [String] page URL, used only for the host-name fallback
# @param html [String] page markup handed to Nokogiri::HTML
# @return [Hash] hash with `:name` and `:twitter` keys; either may be nil
def self.build_site(url, html)
  # Parse once and reuse the document for both lookups.
  doc = Nokogiri::HTML(html)
  name_meta = doc.search('meta[property="og:site_name"]')
  twitter_meta = doc.search('meta[name="twitter:site"]')

  name_attr = name_meta.attribute('content') unless name_meta.empty?
  site_name = name_attr.value unless name_attr.nil?

  if site_name.nil?
    # Capture the host after an http(s)/ftp(s) scheme, skipping any userinfo.
    # A trailing slash is not required and hyphenated hosts are accepted.
    host = url.to_s[%r{(?:http|ftp)s?://(?:[^/\s@]*@)?([^/\s:?#]+)}, 1]
    # Non-bang `capitalize`: the bang form returns nil when nothing changes.
    site_name = host.sub(/\Awww\./, '').capitalize unless host.nil?
  end

  site_twitter = nil
  unless twitter_meta.empty?
    handle = twitter_meta.attribute('content') || twitter_meta.attribute('value')
    site_twitter = handle.value unless handle.nil?
  end

  {
    name: site_name,
    twitter: site_twitter,
  }
end
.generate_hash(text) ⇒ Object
114 115 116 |
# File 'lib/textract.rb', line 114
# Returns the hexadecimal MD5 digest of the given text.
#
# @param text [String] content to fingerprint
# @return [String] hex-encoded MD5 digest
def self.generate_hash(text)
  Digest::MD5.hexdigest(text)
end
.get_author(html) ⇒ Object
68 69 70 71 72 73 74 |
# File 'lib/textract.rb', line 68
# Reads the author name from the page's meta tags.
#
# Looks for `meta[name="author"]` first and `meta[property="author"]`
# second, returning the `content` attribute of the match.
#
# @param html [String] page markup handed to Nokogiri::HTML
# @return [String, nil] the author, or nil when no tag matches or the tag
#   has no `content` attribute
def self.get_author(html)
  # Parse once; both selectors run against the same document.
  doc = Nokogiri::HTML(html)
  author = doc.search('meta[name="author"]')
  author = doc.search('meta[property="author"]') if author.empty?
  return nil if author.empty?

  content = author.attribute('content')
  content.value unless content.nil?
end
.get_og_tags(html, url) ⇒ Object
19 20 21 22 23 24 25 |
# File 'lib/textract.rb', line 19
# Builds an OpenGraph object for the page.
#
# Tries the supplied markup first. If that raises a StandardError, retries
# with the URL instead.
#
# NOTE(review): the method name was missing from this listing and is
# restored from the method summary.
#
# @param html [String] page markup, tried first
# @param url [String] page URL, used when the markup attempt raises
# @return [OpenGraph]
def self.get_og_tags(html, url)
  OpenGraph.new(html)
rescue
  OpenGraph.new(url)
end
.get_page_title(html) ⇒ Object
64 65 66 |
# File 'lib/textract.rb', line 64
# Returns the text of the `title` node(s) found under `head`.
#
# @param html [String] page markup handed to Nokogiri::HTML
# @return [String] text of the matched title node(s)
def self.get_page_title(html)
  head = Nokogiri::HTML(html).search('head')
  head.search('title').text
end
.get_text(url, selectors = nil, format = "markdown") ⇒ Object
15 16 17 |
# File 'lib/textract.rb', line 15
# Creates a Client for the URL, stores it in `@client`, and returns it.
#
# @param url [String] page to extract
# @param selectors [String, nil] passed through to Client.new
# @param format [String] passed through to Client.new
# @return [Client] the newly created client
def self.get_text(url, selectors = nil, format = "markdown")
  client = Client.new(url, selectors, format)
  @client = client
end
.get_twitter(html) ⇒ Object
76 77 78 79 |
# File 'lib/textract.rb', line 76
# Reads the author's Twitter handle from `meta[name="twitter:creator"]`.
#
# @param html [String] page markup handed to Nokogiri::HTML
# @return [String, nil] the tag's `content` value, or nil when the tag is
#   absent or has no `content` attribute
def self.get_twitter(html)
  creator = Nokogiri::HTML(html).search('meta[name="twitter:creator"]')
  return nil if creator.empty?

  # A tag without `content` yields nil here instead of raising.
  content = creator.attribute('content')
  content.value unless content.nil?
end
.smart_extract(html, description, selectors) ⇒ Object
27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
# File 'lib/textract.rb', line 27
# Chooses the element that holds the article and wraps it in a
# Readability::Document.
#
# Selection order:
# 1. If `selectors` (or `article` when `selectors` is nil) matches exactly
#    one element, that element is used.
# 2. If it matches nothing and a description is given, the document is
#    searched for the opening words of the description. The search starts
#    with two words and adds one word at a time while two or more elements
#    still match. A single match selects that element's parent.
# 3. In every other case the whole document is used.
#
# @param html [String] page markup handed to Nokogiri::HTML
# @param description [String, nil] text expected to appear in the article
# @param selectors [String, nil] selector for the article container
# @return [Readability::Document]
def self.smart_extract(html, description, selectors)
  doc = Nokogiri::HTML(html)
  candidates = doc.search(selectors.nil? ? 'article' : selectors)

  article_el = nil
  if candidates.count == 1
    article_el = candidates[0]
  elsif candidates.count == 0 && !description.nil?
    words = description.split(" ")
    matches = []
    unless words.empty?
      # Stop once every word has been used, so a description that stays
      # ambiguous cannot loop forever.
      last_index = [words.length - 1, 1].max
      (1..last_index).each do |i|
        search_text = words[0..i].join(" ")
        # Quote with whichever character the text does not contain.
        # NOTE(review): text containing both quote kinds is still unescaped.
        selector =
          if search_text.index("'")
            "[text()*=\"#{search_text}\"]"
          else
            "[text()*='#{search_text}']"
          end
        matches = doc.search(selector)
        break if matches.count < 2
      end
    end
    article_el = matches[0].parent if matches.count == 1
  end

  # No unambiguous container found: hand the whole document to Readability
  # rather than an empty string.
  article_el ||= doc

  Readability::Document.new(
    article_el.to_s,
    tags: TAG_WHITELIST,
    attributes: %w[src href],
    remove_empty_nodes: false,
  )
end