Class: Textract::Client
- Inherits:
-
Object
- Object
- Textract::Client
- Defined in:
- lib/textract.rb
Instance Attribute Summary collapse
-
#author ⇒ Object
readonly
Returns the value of attribute author.
-
#html ⇒ Object
readonly
Returns the value of attribute html.
-
#md5 ⇒ Object
readonly
Returns the value of attribute md5.
-
#site ⇒ Object
readonly
Returns the value of attribute site.
-
#tags ⇒ Object
readonly
Returns the value of attribute tags.
-
#text ⇒ Object
readonly
Returns the value of attribute text.
-
#title ⇒ Object
readonly
Returns the value of attribute title.
-
#url ⇒ Object
readonly
Returns the value of attribute url.
Instance Method Summary collapse
- #as_json ⇒ Object
-
#initialize(url, selectors, format) ⇒ Client
constructor
A new instance of Client.
- #to_h ⇒ Object
Constructor Details
#initialize(url, selectors, format) ⇒ Client
128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
# File 'lib/textract.rb', line 128 def initialize(url, selectors, format) @url = url agent = Mechanize.new agent.user_agent_alias = 'Mac Safari' @html = agent.get(url).content = Textract.(@html, url) if .url.match(/^(http|ftp)s?:\/\//) @url = .url end @article = Textract.smart_extract(@html, .description, selectors) if @article.content.nil? @text = "" else if format == 'markdown' @text = ReverseMarkdown.convert @article.content, unknown_tags: :bypass else @text = @article.content end end @md5 = Textract.generate_hash @text = Textract. @article, @html @site = Textract.build_site @url, @html @title = .title || Textract.get_page_title(@html) if @url.match(/\/robots.txt$/) and @title = @text @title = @url end end |
Instance Attribute Details
#author ⇒ Object (readonly)
Returns the value of attribute author.
125 126 127 |
# File 'lib/textract.rb', line 125 def author @author end |
#html ⇒ Object (readonly)
Returns the value of attribute html.
119 120 121 |
# File 'lib/textract.rb', line 119 def html @html end |
#md5 ⇒ Object (readonly)
Returns the value of attribute md5.
124 125 126 |
# File 'lib/textract.rb', line 124 def md5 @md5 end |
#site ⇒ Object (readonly)
Returns the value of attribute site.
126 127 128 |
# File 'lib/textract.rb', line 126 def site @site end |
#tags ⇒ Object (readonly)
Returns the value of attribute tags.
121 122 123 |
# File 'lib/textract.rb', line 121 def tags @tags end |
#text ⇒ Object (readonly)
Returns the value of attribute text.
123 124 125 |
# File 'lib/textract.rb', line 123 def text @text end |
#title ⇒ Object (readonly)
Returns the value of attribute title.
122 123 124 |
# File 'lib/textract.rb', line 122 def title @title end |
#url ⇒ Object (readonly)
Returns the value of attribute url.
120 121 122 |
# File 'lib/textract.rb', line 120 def url @url end |
Instance Method Details
#as_json ⇒ Object
157 158 159 |
# File 'lib/textract.rb', line 157 def as_json to_h.to_json end |
#to_h ⇒ Object
161 162 163 164 165 166 167 168 169 |
# File 'lib/textract.rb', line 161 def to_h { url: @url, text: @text, md5: @md5, author: @author, title: @title, } end |