Class: RubyCrawl::SiteCrawler::PageResult

Inherits:
Object
  • Object
show all
Defined in:
lib/rubycrawl/site_crawler.rb

Overview

Page result yielded to the block with lazy clean_markdown.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(url:, html:, raw_text:, clean_html:, links:, metadata:, depth:) ⇒ PageResult

Returns a new instance of PageResult.



12
13
14
15
16
17
18
19
20
# File 'lib/rubycrawl/site_crawler.rb', line 12

# Builds a page result from the values captured for one crawled page.
#
# Every keyword is required and is stored as-is in the instance variable of
# the same name; nothing is copied, validated or converted here.
#
# @param url        the URL of the page
# @param html       the page HTML
# @param raw_text   the raw text of the page
# @param clean_html the noise-stripped HTML of the page
# @param links      the links associated with the page
# @param metadata   the metadata associated with the page
# @param depth      the depth value for the page
def initialize(url:, html:, raw_text:, clean_html:, links:, metadata:, depth:)
  @url        = url
  @html       = html
  @raw_text   = raw_text
  @clean_html = clean_html
  @links      = links
  @metadata   = metadata
  @depth      = depth
end

Instance Attribute Details

#clean_html ⇒ Object (readonly)

Returns the value of attribute clean_html.



10
11
12
# File 'lib/rubycrawl/site_crawler.rb', line 10

# Read-only accessor for the clean_html attribute.
#
# @return the value currently held in @clean_html
def clean_html = @clean_html

#depth ⇒ Object (readonly)

Returns the value of attribute depth.



10
11
12
# File 'lib/rubycrawl/site_crawler.rb', line 10

# Read-only accessor for the depth attribute.
#
# @return the value currently held in @depth
def depth = @depth

#html ⇒ Object (readonly)

Returns the value of attribute html.



10
11
12
# File 'lib/rubycrawl/site_crawler.rb', line 10

# Read-only accessor for the html attribute.
#
# @return the value currently held in @html
def html = @html

#links ⇒ Object (readonly)

Returns the value of attribute links.



10
11
12
# File 'lib/rubycrawl/site_crawler.rb', line 10

# Read-only accessor for the links attribute.
#
# @return the value currently held in @links
def links = @links

#metadata ⇒ Object (readonly)

Returns the value of attribute metadata.



10
11
12
# File 'lib/rubycrawl/site_crawler.rb', line 10

# Read-only accessor for the metadata attribute.
#
# @return the value currently held in @metadata
def metadata
  @metadata
end

#raw_text ⇒ Object (readonly)

Returns the value of attribute raw_text.



10
11
12
# File 'lib/rubycrawl/site_crawler.rb', line 10

# Read-only accessor for the raw_text attribute.
#
# @return the value currently held in @raw_text
def raw_text = @raw_text

#url ⇒ Object (readonly)

Returns the value of attribute url.



10
11
12
# File 'lib/rubycrawl/site_crawler.rb', line 10

# Read-only accessor for the url attribute.
#
# @return the value currently held in @url
def url = @url

Instance Method Details

#clean_markdown ⇒ Object

Markdown derived from noise-stripped HTML. Lazy — same as Result#clean_markdown.



31
32
33
34
# File 'lib/rubycrawl/site_crawler.rb', line 31

# Markdown derived from the noise-stripped HTML, converted on first use.
#
# The conversion input is clean_html, or html when clean_html is empty.
# Both the input selection and the conversion happen only while no result
# is cached in @clean_markdown, so repeated calls do no extra work.
#
# @return whatever MarkdownConverter.convert returns for the chosen input,
#   called with base_url: final_url
def clean_markdown
  @clean_markdown ||= begin
    markdown_source = clean_html.empty? ? html : clean_html
    MarkdownConverter.convert(markdown_source, base_url: final_url)
  end
end

#clean_text ⇒ Object

Plain text derived from noise-stripped HTML. Lazy — same as Result#clean_text.



23
24
25
26
27
28
# File 'lib/rubycrawl/site_crawler.rb', line 23

# Plain text derived from the noise-stripped HTML, computed on first use.
#
# Builds a Result from this page's html, raw_text, clean_html, links and
# metadata, delegates to Result#clean_text, and memoizes the value in
# @clean_text.
#
# @return whatever Result#clean_text returns
def clean_text
  @clean_text ||= Result.new(
    html: html, raw_text: raw_text, clean_html: clean_html,
    links: links, metadata: metadata
  ).clean_text
end

#final_url ⇒ Object

The final URL after redirects.



37
38
39
# File 'lib/rubycrawl/site_crawler.rb', line 37

# The final URL after redirects.
#
# @return the 'final_url' entry of metadata when that entry is truthy,
#   otherwise url
def final_url
  metadata['final_url'] || url
end