Class: Textract::Client

Inherits:
Object
  • Object
show all
Defined in:
lib/textract.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, selectors, format) ⇒ Client

Returns a new instance of Client.



139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
# File 'lib/textract.rb', line 139

# Fetches +url+ with Mechanize, extracts the article from the response and
# populates every reader on this object (url, html, tags, text, md5, author,
# site, title).
#
# @param url [String] address of the page to fetch
# @param selectors [Object] forwarded unchanged to Textract.smart_extract
#   (expected shape is defined there — not visible from this method)
# @param format [String] pass 'markdown' to run the extracted content through
#   ReverseMarkdown; any other value keeps the content as extracted
# @return [Client] a new instance of Client
def initialize(url, selectors, format)
  @url = url
  agent = Mechanize.new
  agent.user_agent_alias = 'Mac Safari'
  @html = agent.get(url).content
  @tags = Textract.get_og_tags(@html, url)

  # Prefer the URL advertised by the page's tags, but only when it is an
  # absolute http(s)/ftp(s) URL. `to_s` keeps a missing tag URL (nil) from
  # raising, and `\A` anchors at the start of the string rather than at the
  # start of any line.
  advertised_url = @tags.url
  @url = advertised_url if advertised_url.to_s =~ %r{\A(http|ftp)s?://}

  @article = Textract.smart_extract(@html, @tags.description, selectors)
  content = @article.content
  @text =
    if content.nil?
      ""
    elsif format == 'markdown'
      ReverseMarkdown.convert(content, unknown_tags: :bypass)
    else
      content
    end

  @md5 = Textract.generate_hash(@text)
  @author = Textract.build_author(@article, @html)
  @site = Textract.build_site(@url, @html)
  @title = @tags.title || Textract.get_page_title(@html)

  # For a robots.txt document whose "title" is merely the whole body, use the
  # URL as the title instead. This must be a comparison (==): an assignment
  # here would always be truthy and would overwrite every robots.txt title.
  @title = @url if @url.match(%r{/robots\.txt\z}) && @title == @text
end

Instance Attribute Details

#author ⇒ Object (readonly)

Returns the value of attribute author.



136
137
138
# File 'lib/textract.rb', line 136

# Reader for the author information that #initialize obtains from
# Textract.build_author(@article, @html).
#
# @return [Object] the stored @author value; its exact type is decided by
#   Textract.build_author, which is not shown here
def author
  @author
end

#html ⇒ Object (readonly)

Returns the value of attribute html.



130
131
132
# File 'lib/textract.rb', line 130

# Reader for the page content that #initialize stores from the Mechanize
# response (`agent.get(url).content`).
#
# @return [Object] the stored @html value (presumably the raw response body
#   as a String — confirm against Mechanize)
def html
  @html
end

#md5 ⇒ Object (readonly)

Returns the value of attribute md5.



135
136
137
# File 'lib/textract.rb', line 135

# Reader for the value #initialize computes with
# Textract.generate_hash(@text), i.e. a hash of the extracted text.
#
# @return [Object] the stored @md5 value (presumably an MD5 digest, going by
#   the name — confirm in Textract.generate_hash)
def md5
  @md5
end

#site ⇒ Object (readonly)

Returns the value of attribute site.



137
138
139
# File 'lib/textract.rb', line 137

# Reader for the site information that #initialize obtains from
# Textract.build_site(@url, @html).
#
# @return [Object] the stored @site value; its exact type is decided by
#   Textract.build_site, which is not shown here
def site
  @site
end

#tags ⇒ Object (readonly)

Returns the value of attribute tags.



132
133
134
# File 'lib/textract.rb', line 132

# Reader for the tag object that #initialize obtains from
# Textract.get_og_tags(@html, url). The constructor reads +url+,
# +description+ and +title+ from it.
#
# @return [Object] the stored @tags value
def tags
  @tags
end

#text ⇒ Object (readonly)

Returns the value of attribute text.



134
135
136
# File 'lib/textract.rb', line 134

# Reader for the extracted article text set by #initialize: an empty string
# when the article has no content, the ReverseMarkdown conversion of the
# content when the requested format was 'markdown', and the article content
# unchanged otherwise.
#
# @return [Object] the stored @text value
def text
  @text
end

#title ⇒ Object (readonly)

Returns the value of attribute title.



133
134
135
# File 'lib/textract.rb', line 133

# Reader for the page title set by #initialize: the title from the page's
# tags when present, otherwise the result of Textract.get_page_title(@html).
# For URLs ending in /robots.txt the constructor may substitute the URL
# itself.
#
# @return [Object] the stored @title value
def title
  @title
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



131
132
133
# File 'lib/textract.rb', line 131

# Reader for the page URL set by #initialize: the URL passed to the
# constructor, replaced by the URL from the page's tags when that one starts
# with an http(s):// or ftp(s):// scheme.
#
# @return [Object] the stored @url value
def url
  @url
end

Instance Method Details

#as_json ⇒ Object



168
169
170
# File 'lib/textract.rb', line 168

# Serializes this client's exported fields as JSON.
#
# @return [Object] the result of calling +to_json+ on the hash built by #to_h
def as_json
  fields = to_h
  fields.to_json
end

#to_h ⇒ Object



172
173
174
175
176
177
178
179
180
181
# File 'lib/textract.rb', line 172

# Collects the exported fields of this client into a Hash.
#
# The html and tags values are not part of the export.
#
# @return [Hash{Symbol => Object}] the keys :url, :text, :md5, :author,
#   :title and :site, in that order, mapped to the matching instance variables
def to_h
  names = [:url, :text, :md5, :author, :title, :site]
  values = [@url, @text, @md5, @author, @title, @site]
  names.zip(values).to_h
end