Class: Textract::Client
- Inherits:
-
Object
- Object
- Textract::Client
- Defined in:
- lib/textract.rb
Instance Attribute Summary collapse
-
#author ⇒ Object
readonly
Returns the value of attribute author.
-
#html ⇒ Object
readonly
Returns the value of attribute html.
-
#md5 ⇒ Object
readonly
Returns the value of attribute md5.
-
#site ⇒ Object
readonly
Returns the value of attribute site.
-
#tags ⇒ Object
readonly
Returns the value of attribute tags.
-
#text ⇒ Object
readonly
Returns the value of attribute text.
-
#title ⇒ Object
readonly
Returns the value of attribute title.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #as_json ⇒ Object
-
#initialize(url, selectors, format) ⇒ Client
constructor
A new instance of Client.
- #to_h ⇒ Object
Constructor Details
#initialize(url, selectors, format) ⇒ Client
Returns a new instance of Client.
139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 |
# File 'lib/textract.rb', line 139
# Downloads +url+ and fills in the extracted fields (html, tags, text, md5,
# author, site, title).
#
# @param url [String] address of the page to fetch
# @param selectors [Object] forwarded unchanged to Textract.smart_extract
# @param format [String] 'markdown' runs the article content through
#   ReverseMarkdown; any other value stores the content as extracted
def initialize(url, selectors, format)
  @url = url

  agent = Mechanize.new
  agent.user_agent_alias = 'Mac Safari'
  @html = agent.get(url).content

  # NOTE(review): the helper's name is missing from this listing (it reads
  # "Textract.(@html, url)"); get_og_tags is a best guess -- confirm against
  # lib/textract.rb before merging.
  @tags = Textract.get_og_tags(@html, url)

  # Prefer the URL reported by the page's tags, but only when it is absolute.
  # The nil check avoids a NoMethodError when the tags carry no URL.
  tag_url = @tags.url
  @url = tag_url if tag_url && tag_url.match(/^(http|ftp)s?:\/\//)

  @article = Textract.smart_extract(@html, @tags.description, selectors)
  content = @article.content
  @text =
    if content.nil?
      ""
    elsif format == 'markdown'
      ReverseMarkdown.convert content, unknown_tags: :bypass
    else
      content
    end

  @md5 = Textract.generate_hash @text
  # NOTE(review): this helper's name is also missing from the listing
  # ("Textract. @article, @html"); get_author is a best guess -- confirm.
  @author = Textract.get_author @article, @html
  @site = Textract.build_site @url, @html
  @title = @tags.title || Textract.get_page_title(@html)

  # For a robots.txt URL whose title is just the body text, use the URL as
  # the title instead. The condition previously read `and @title = @text`,
  # an assignment that overwrote @title and was truthy for any text, so the
  # branch ran for every robots.txt URL. The dot is now escaped as well so
  # only a literal "robots.txt" suffix matches.
  @title = @url if @url.match(/\/robots\.txt$/) && @title == @text
end
Instance Attribute Details
#author ⇒ Object (readonly)
Returns the value of attribute author.
136 137 138 |
# File 'lib/textract.rb', line 136
# Reader for the author attribute.
#
# @return [Object] the current value of @author
def author
  @author
end
#html ⇒ Object (readonly)
Returns the value of attribute html.
130 131 132 |
# File 'lib/textract.rb', line 130
# Reader for the html attribute (the constructor stores the fetched page
# content here).
#
# @return [Object] the current value of @html
def html
  @html
end
#md5 ⇒ Object (readonly)
Returns the value of attribute md5.
135 136 137 |
# File 'lib/textract.rb', line 135
# Reader for the md5 attribute (the constructor sets it from
# Textract.generate_hash applied to the extracted text).
#
# @return [Object] the current value of @md5
def md5
  @md5
end
#site ⇒ Object (readonly)
Returns the value of attribute site.
137 138 139 |
# File 'lib/textract.rb', line 137
# Reader for the site attribute (the constructor sets it from
# Textract.build_site).
#
# @return [Object] the current value of @site
def site
  @site
end
#tags ⇒ Object (readonly)
Returns the value of attribute tags.
132 133 134 |
# File 'lib/textract.rb', line 132
# Reader for the tags attribute. The constructor reads url, description and
# title from this object.
#
# @return [Object] the current value of @tags
def tags
  @tags
end
#text ⇒ Object (readonly)
Returns the value of attribute text.
134 135 136 |
# File 'lib/textract.rb', line 134
# Reader for the text attribute (the extracted article content; the
# constructor stores "" when the article has no content).
#
# @return [Object] the current value of @text
def text
  @text
end
#title ⇒ Object (readonly)
Returns the value of attribute title.
133 134 135 |
# File 'lib/textract.rb', line 133
# Reader for the title attribute.
#
# @return [Object] the current value of @title
def title
  @title
end
#url ⇒ Object (readonly)
Returns the value of attribute url.
131 132 133 |
# File 'lib/textract.rb', line 131
# Reader for the url attribute. The constructor starts with the URL it was
# given and replaces it with the tags' URL when that one is absolute.
#
# @return [Object] the current value of @url
def url
  @url
end
Instance Method Details
#as_json ⇒ Object
168 169 170 |
# File 'lib/textract.rb', line 168
# Serializes the hash built by #to_h by calling #to_json on it.
#
# Despite the name, this returns whatever Hash#to_json returns (a JSON
# string when the json library is loaded), not a Hash.
#
# @return [String] JSON text for the hash returned by #to_h
def as_json
  to_h.to_json
end
#to_h ⇒ Object
172 173 174 175 176 177 178 179 180 181 |
# File 'lib/textract.rb', line 172
# Builds a plain Hash of the extracted fields.
#
# Only url, text, md5, author, title and site are included; the html and
# tags attributes are left out.
#
# @return [Hash{Symbol => Object}] the six fields keyed by symbol
def to_h
  {
    url: @url,
    text: @text,
    md5: @md5,
    author: @author,
    title: @title,
    site: @site,
  }
end