Class: RubyCrawl::SiteCrawler::PageResult

Inherits:
Object
  • Object
show all
Defined in:
lib/rubycrawl/site_crawler.rb

Overview

Page result yielded to the caller's block. The derived clean_text and clean_markdown values are computed lazily on first access and then cached.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url:, html:, raw_text:, clean_html:, links:, metadata:, depth:) ⇒ PageResult

Returns a new instance of PageResult.



13
14
15
16
17
18
19
20
21
# File 'lib/rubycrawl/site_crawler.rb', line 13

# Stores the data gathered for a single page in instance variables.
#
# Every keyword is required and is kept as given, without copying or
# validation.
#
# @param url the page URL (used as the fallback by #final_url)
# @param html the page HTML
# @param raw_text the page text
# @param clean_html the noise-stripped HTML; must respond to +empty?+ for
#   #clean_markdown to work
# @param links the links collected for the page
# @param metadata the page metadata; #final_url looks up its 'final_url' key
# @param depth the depth value associated with this page
def initialize(url:, html:, raw_text:, clean_html:, links:, metadata:, depth:)
  @url        = url
  @html       = html
  @raw_text   = raw_text
  @clean_html = clean_html
  @links      = links
  # The right-hand side must be explicit: a dangling `@metadata =` chains
  # onto the next line and silently stores +depth+ instead.
  @metadata   = metadata
  @depth      = depth
end

Instance Attribute Details

#clean_html ⇒ Object (readonly)

Returns the value of attribute clean_html.



11
12
13
# File 'lib/rubycrawl/site_crawler.rb', line 11

# Reader for the +@clean_html+ instance variable.
#
# @return the value currently held in +@clean_html+
def clean_html = @clean_html

#depth ⇒ Object (readonly)

Returns the value of attribute depth.



11
12
13
# File 'lib/rubycrawl/site_crawler.rb', line 11

# Reader for the +@depth+ instance variable.
#
# @return the value currently held in +@depth+
def depth = @depth

#html ⇒ Object (readonly)

Returns the value of attribute html.



11
12
13
# File 'lib/rubycrawl/site_crawler.rb', line 11

# Reader for the +@html+ instance variable.
#
# @return the value currently held in +@html+
def html = @html

#links ⇒ Object (readonly)

Returns the value of attribute links.



11
12
13
# File 'lib/rubycrawl/site_crawler.rb', line 11

# Reader for the +@links+ instance variable.
#
# @return the value currently held in +@links+
def links = @links

#metadata ⇒ Object (readonly)

Returns the value of attribute metadata.



11
12
13
# File 'lib/rubycrawl/site_crawler.rb', line 11

# Reader for the +@metadata+ instance variable.
#
# @return the value currently held in +@metadata+
def metadata
  @metadata
end

#raw_text ⇒ Object (readonly)

Returns the value of attribute raw_text.



11
12
13
# File 'lib/rubycrawl/site_crawler.rb', line 11

# Reader for the +@raw_text+ instance variable.
#
# @return the value currently held in +@raw_text+
def raw_text = @raw_text

#url ⇒ Object (readonly)

Returns the value of attribute url.



11
12
13
# File 'lib/rubycrawl/site_crawler.rb', line 11

# Reader for the +@url+ instance variable.
#
# @return the value currently held in +@url+
def url = @url

Instance Method Details

#clean_markdown ⇒ Object

Markdown derived from noise-stripped HTML. Lazy — same as Result#clean_markdown.



32
33
34
35
# File 'lib/rubycrawl/site_crawler.rb', line 32

# Markdown for the page, converted once and cached in +@clean_markdown+.
#
# The input handed to MarkdownConverter.convert is #clean_html, unless
# that is empty, in which case the full #html is used. #final_url is
# passed as the +base_url+ keyword.
#
# @return the result of MarkdownConverter.convert (cached after the first
#   truthy result)
def clean_markdown
  # The input is picked on every call; only the conversion is memoized.
  source =
    if clean_html.empty?
      html
    else
      clean_html
    end
  @clean_markdown ||= MarkdownConverter.convert(source, base_url: final_url)
end

#clean_text ⇒ Object

Plain text derived from noise-stripped HTML. Lazy — same as Result#clean_text.



24
25
26
27
28
29
# File 'lib/rubycrawl/site_crawler.rb', line 24

# Plain text for the page, computed once and cached in +@clean_text+.
#
# Delegates to Result#clean_text by building a temporary Result from this
# page's html, raw_text, clean_html, links and metadata.
#
# @return the value of Result#clean_text (cached after the first truthy
#   result)
def clean_text
  @clean_text ||= Result.new(
    html: html, raw_text: raw_text, clean_html: clean_html,
    links: links, metadata: metadata
  ).clean_text
end

#final_url ⇒ Object

The final URL after redirects.



38
39
40
# File 'lib/rubycrawl/site_crawler.rb', line 38

# The final URL after redirects.
#
# Reads the 'final_url' entry from #metadata and falls back to #url when
# that entry is nil or false.
#
# @return the 'final_url' metadata entry, or #url when it is not set
def final_url
  # The lookup needs the metadata receiver: a bare ['final_url'] is an
  # always-truthy array literal, which makes the url fallback unreachable.
  metadata['final_url'] || url
end