Module: Textract
- Defined in:
  - lib/textract.rb
  - lib/textract/version.rb
Defined Under Namespace
Classes: Client
Constant Summary collapse
- VERSION = "0.0.1"
Class Method Summary collapse
- .get_og_tags(html) ⇒ Object
- .get_page_title(html) ⇒ Object
- .get_text(url, selectors = nil) ⇒ Object
  attr_accessor :client
- .get_text_from_description(html, description, selectors) ⇒ Object
Class Method Details
.get_og_tags(html) ⇒ Object
15 16 17 |
# File 'lib/textract.rb', line 15
#
# Wraps the given HTML in an OpenGraph object.
#
# The method name was missing from the rendered source (`def self.(html)`);
# it is restored here to match the documented signature `.get_og_tags(html)`.
#
# @param html [String] markup handed unchanged to OpenGraph.new
#   (presumably a full HTML page; confirm against callers)
# @return [OpenGraph] the object built from +html+
def self.get_og_tags(html)
  OpenGraph.new(html)
end
.get_page_title(html) ⇒ Object
56 57 58 |
# File 'lib/textract.rb', line 56
#
# Returns the text of the page's first <title> element.
#
# Only the first match is used: collecting every <title> node and joining
# their text would also pick up titles nested elsewhere in the markup
# (for example inside inline SVG) and glue them onto the page title.
#
# @param html [String] markup to parse with Nokogiri
# @return [String] the title text, or an empty string when there is no
#   <title> element
def self.get_page_title(html)
  title_node = Nokogiri::HTML(html).at_css('title')
  title_node ? title_node.text : ''
end
.get_text(url, selectors = nil) ⇒ Object
attr_accessor :client
11 12 13 |
# File 'lib/textract.rb', line 11
#
# Builds a Client for +url+ and keeps it in the module-level @client.
#
# @param url [Object] first argument forwarded to Client.new
#   (presumably the address of the page to extract; confirm in Client)
# @param selectors [Object, nil] second argument forwarded to Client.new;
#   defaults to nil
# @return [Client] the newly created client
def self.get_text(url, selectors = nil)
  client = Client.new(url, selectors)
  @client = client
end
.get_text_from_description(html, description, selectors) ⇒ Object
19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
# File 'lib/textract.rb', line 19
#
# Picks the article element out of +html+, cleans it with Readability and
# converts the result to Markdown.
#
# Element selection:
# 1. Search for +selectors+ (or 'article' when +selectors+ is nil). A single
#    match is used as the article element.
# 2. With no match and a non-nil +description+, search for an element whose
#    text contains a growing prefix of the description (two words, then
#    three, ...) until fewer than two elements match. A unique match selects
#    that element's parent.
# 3. In every other case the whole document is used.
#
# Changes from the previous implementation:
# - the debugging `puts` of each search text is removed;
# - the prefix search stops once the entire description has been tried, so
#   it can no longer loop forever when several elements keep matching;
# - an ambiguous or empty prefix search falls back to the whole document
#   instead of passing an empty string to Readability.
#
# @param html [String] markup to parse with Nokogiri
# @param description [String, nil] text expected to appear in the article
# @param selectors [String, nil] selector for the article container
# @return [String] Markdown produced by ReverseMarkdown
def self.get_text_from_description(html, description, selectors)
  doc = Nokogiri::HTML(html)
  candidates = doc.search(selectors.nil? ? 'article' : selectors)

  article_el =
    if candidates.count == 1
      candidates[0]
    elsif !description.nil? && candidates.count == 0
      words = description.split(' ')
      # Always try the two-word prefix once, then grow up to the full text.
      last_index = [words.length - 1, 1].max
      matches = []
      (1..last_index).each do |i|
        search_text = words[0..i].join(' ')
        # NOTE(review): search_text is interpolated unescaped; a description
        # containing a single quote will likely break this selector.
        matches = doc.search "[text()*='#{search_text}']"
        break if matches.count < 2
      end
      # Unique hit: take the enclosing element. Otherwise use the whole page.
      matches.count == 1 ? matches[0].parent : doc
    else
      doc
    end

  cleaned_html = Readability::Document.new(
    article_el.to_s,
    tags: %w[div span p a img ul ol li blockquote table tr td h1 h2 h3 h4 h5 b em i strong],
    attributes: %w[src href],
    remove_empty_nodes: true
  ).content

  # TODO: change to drop once article is supported by reversemarkdown
  ReverseMarkdown.convert cleaned_html, unknown_tags: :bypass
end