Class: Textract::Client

Inherits:
Object
  • Object
show all
Defined in:
lib/textract.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url, selectors, format) ⇒ Client



128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# File 'lib/textract.rb', line 128

# Fetches +url+, extracts the main article from the response and derives the
# metadata exposed by this client's readers.
#
# Construction performs a request through Mechanize using the 'Mac Safari'
# user agent alias.
#
# @param url [String] address of the page to fetch
# @param selectors [Object] passed through unchanged to Textract.smart_extract
# @param format [String] 'markdown' converts the extracted content with
#   ReverseMarkdown; any other value keeps the content as extracted
def initialize(url, selectors, format)
  @url = url
  agent = Mechanize.new
  agent.user_agent_alias = 'Mac Safari'
  @html = agent.get(url).content
  @tags = Textract.get_og_tags(@html, url)

  # Prefer the URL advertised by the tags, but only when it is an absolute
  # http(s)/ftp(s) URL. A missing value keeps the requested URL instead of
  # raising on nil.
  tag_url = @tags.url
  @url = tag_url if tag_url && tag_url.match(%r{\A(http|ftp)s?://})

  @article = Textract.smart_extract(@html, @tags.description, selectors)
  content = @article.content
  @text =
    if content.nil?
      ""
    elsif format == 'markdown'
      ReverseMarkdown.convert(content, unknown_tags: :bypass)
    else
      content
    end

  @md5 = Textract.generate_hash(@text)
  @author = Textract.build_author(@article, @html)
  @site = Textract.build_site(@url, @html)
  @title = @tags.title || Textract.get_page_title(@html)

  # For robots.txt, a title that merely repeats the body text is useless, so
  # fall back to the URL. This has to be a comparison: an assignment here is
  # always truthy for a String and would discard every derived title.
  @title = @url if @url.match(%r{/robots\.txt$}) && @title == @text
end

Instance Attribute Details

#author ⇒ Object (readonly)

Returns the value of attribute author.



125
126
127
# File 'lib/textract.rb', line 125

# Reader for @author, which #initialize sets to the result of
# Textract.build_author(@article, @html).
#
# @return [Object] the stored value; its type is decided by
#   Textract.build_author, which is not visible here
def author
  @author
end

#html ⇒ Object (readonly)

Returns the value of attribute html.



119
120
121
# File 'lib/textract.rb', line 119

# Reader for @html, the +content+ of the response that #initialize fetched
# for the requested URL through Mechanize.
#
# @return [Object] the fetched content (presumably a String of HTML —
#   confirm against Mechanize)
def html
  @html
end

#md5 ⇒ Object (readonly)

Returns the value of attribute md5.



124
125
126
# File 'lib/textract.rb', line 124

# Reader for @md5, which #initialize sets to Textract.generate_hash(@text).
#
# @return [Object] the hash of the extracted text (presumably an MD5 digest,
#   going by the name — confirm in Textract.generate_hash)
def md5
  @md5
end

#site ⇒ Object (readonly)

Returns the value of attribute site.



126
127
128
# File 'lib/textract.rb', line 126

# Reader for @site, which #initialize sets to the result of
# Textract.build_site(@url, @html).
#
# @return [Object] the stored value; its type is decided by
#   Textract.build_site, which is not visible here
def site
  @site
end

#tags ⇒ Object (readonly)

Returns the value of attribute tags.



121
122
123
# File 'lib/textract.rb', line 121

# Reader for @tags, the object #initialize obtains from
# Textract.get_og_tags(@html, url). #initialize reads +url+, +description+
# and +title+ from it.
#
# @return [Object] the tags object returned by Textract.get_og_tags
def tags
  @tags
end

#text ⇒ Object (readonly)

Returns the value of attribute text.



123
124
125
# File 'lib/textract.rb', line 123

# Reader for @text, the extracted article text set by #initialize:
# an empty string when the article has no content, the ReverseMarkdown
# conversion of the content when the requested format is 'markdown', and
# the article content unchanged otherwise.
#
# @return [Object] the extracted text (presumably a String — confirm
#   against Textract.smart_extract)
def text
  @text
end

#title ⇒ Object (readonly)

Returns the value of attribute title.



122
123
124
# File 'lib/textract.rb', line 122

# Reader for @title. #initialize takes it from the tags' title, falling back
# to Textract.get_page_title(@html); for URLs ending in robots.txt it may be
# replaced by the URL.
#
# @return [Object] the stored title
def title
  @title
end

#url ⇒ Object (readonly)

Returns the value of attribute url.



120
121
122
# File 'lib/textract.rb', line 120

# Reader for @url: the URL passed to #initialize, replaced by the tags' URL
# when that value starts with an http(s) or ftp(s) scheme.
#
# @return [Object] the stored URL
def url
  @url
end

Instance Method Details

#as_json ⇒ Object



157
158
159
# File 'lib/textract.rb', line 157

# Serializes the summary produced by #to_h by calling +to_json+ on it.
#
# @return [Object] whatever +to_json+ returns for the #to_h hash
def as_json
  summary = to_h
  summary.to_json
end

#to_h ⇒ Object



161
162
163
164
165
166
167
168
169
# File 'lib/textract.rb', line 161

# Collects the url, text, md5, author and title instance variables into a
# Hash keyed by symbol, in that order.
#
# @return [Hash{Symbol => Object}] the five fields and their current values
def to_h
  %i[url text md5 author title].each_with_object({}) do |field, summary|
    summary[field] = instance_variable_get("@#{field}")
  end
end