Module: CxExtractor

Extended by: Chart, Utils

Defined in: lib/cx_extractor.rb,
lib/cx_extractor/chart.rb,
lib/cx_extractor/utils.rb,
lib/cx_extractor/config.rb,
lib/cx_extractor/version.rb

Overview

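Extracts the main article text (`article`) and the page title (`get_title`) from an HTML string. Options are adjusted through `configure`.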

Defined Under Namespace

Modules: Chart, Utils

Constant Summary collapse

# Pattern used by get_title; capture group 1 is the text between the opening
# and closing <title> tags.
# Matching is case-insensitive (`i`), tolerates attributes on the opening tag,
# and lets the title span several lines (`m`), so `<TITLE lang="en">` and
# titles broken across lines are found as well. The `\b` keeps longer tag
# names such as <titles> from matching.
TITLE_REGEXP =

%r{<title\b[^>]*>(.*?)</title>}im.freeze

# Default option values; `options` starts from a copy of this hash.
# Only the outer hash is frozen here; the nested :chart_theme hash is not.
DEFAULTS =

{
  # presumably the value returned by `threshold`, which get_contect_block
  # passes to find_surge -- TODO confirm in config.rb
  threshold: 86,
  # NOTE(review): the spelling looks like a typo for "block_width"; it is kept
  # as-is because code outside this view may read the key under this exact name.
  balck_width: 3,
  # presumably read through `explore_parent` in `article`, where a truthy
  # value makes get_content_by_tag run -- TODO confirm
  explore_parent: true,
  # presumably read through `chart_distribution` in `get_content`, where a
  # truthy value makes `chart` run -- TODO confirm
  chart_distribution: false,
  # chart settings; presumably consumed by the Chart module (not visible here)
  chart_file_name: 'distribution.png',
  chart_theme: {
    marker_color: '#AEA9A9',
    font_color: 'black',
    background_colors: 'white'
  }
}.freeze

# Version string of CxExtractor (frozen so it cannot be modified in place).
VERSION =

'0.1.2'.freeze

Class Attribute Summary collapse

.options ⇒ Object

Class Method Summary collapse

Methods included from Chart

cal_color, cal_labels, chart, gruff_line

Methods included from Utils

find_dive, find_surge, get_clean_text, line_block_distribute, replace_special_char

Class Attribute Details

.options ⇒ `Object`



17
18
19

# File 'lib/cx_extractor/config.rb', line 17

# Returns the mutable options hash, building it from DEFAULTS on first use
# and memoizing it in @options.
#
# DEFAULTS is frozen only at the top level; the hashes nested inside it
# (such as :chart_theme) are not, and a plain `dup` would share them with the
# constant. Nested hashes are therefore copied too, so changing
# `options[:chart_theme]` never alters DEFAULTS.
#
# @return [Hash] the current option values
def options
  @options ||= DEFAULTS.each_with_object({}) do |(key, value), copy|
    copy[key] = value.is_a?(Hash) ? value.dup : value
  end
end

Class Method Details

.article(html) ⇒ `Object`

# File 'lib/cx_extractor.rb', line 15

# Extracts the main text content from an HTML document.
#
# The HTML is turned into text by get_clean_text and split into stripped
# lines; get_content then selects the content lines using the distribution
# computed by line_block_distribute. When `explore_parent` is truthy the
# result is replaced by the output of get_content_by_tag.
#
# @param html [String] raw HTML of the page
# @return [String] the extracted text with runs of spaces and newlines
#   collapsed and surrounding whitespace removed
def article(html)
  lines = get_clean_text(html).split("\n").map(&:strip)
  content = get_content(lines, line_block_distribute(lines))
  content = get_content_by_tag(html, content) if explore_parent
  # TODO: optional newline removal (a `remove_newline` option) is not implemented.
  # A bare `squeeze` collapses every doubled character ("book" -> "bok");
  # restrict it to whitespace so only blank runs are compacted.
  content.squeeze(" \n").strip
end

.configure {|_self| ... } ⇒ `Object`

Yields:

(_self)

Yield Parameters:

_self (CxExtractor) —

the object that the method was called on



23
24
25

# File 'lib/cx_extractor/config.rb', line 23

# Hands the receiver to the given block (e.g. to adjust its settings) and
# returns the block's result.
#
# @yieldparam _self [CxExtractor] the object the method was called on
# @return [Object] whatever the block returns
def configure
  yield(self)
end

.get_contect_block(block_distribution, to_line) ⇒ `Object`

# File 'lib/cx_extractor.rb', line 48

# Locates the next block in block_distribution, searching from to_line.
#
# The start line comes from find_surge (called with `threshold`) and the end
# line from find_dive, which is given that start line. find_dive is called
# even when the start line is negative; callers are expected to check the
# start line themselves (get_content does).
#
# NOTE(review): "contect" in the method name looks like a typo for "content";
# the name is kept because it is part of the public interface.
#
# @param block_distribution [Array] per-line distribution to search
# @param to_line [Integer] line index the search starts from
# @return [Array(Integer, Integer)] the pair [from_line, to_line]
def get_contect_block(block_distribution, to_line)
  start_line = find_surge(block_distribution, to_line, threshold)
  [start_line, find_dive(block_distribution, start_line)]
end

.get_content(lines, block_distribution) ⇒ `Object`

# File 'lib/cx_extractor.rb', line 30

# Collects the lines of every content block found in block_distribution.
#
# Blocks are located one after another with get_contect_block, each search
# starting at the end line of the previous block, until a negative start line
# signals that no further block exists.
# When `chart_distribution` is truthy the collected block boundaries are
# passed to `chart`; if charting was requested but no block was found, a
# warning is written instead. Nothing is charted or warned about when
# charting is disabled.
#
# @param lines [Array<String>] text lines of the page
# @param block_distribution [Array] per-line distribution, as produced by
#   line_block_distribute in `article`
# @return [String] the content lines joined with "\n" ("" when no block exists)
def get_content(lines, block_distribution)
  to_line = 0
  content = []
  chart_points = [] # flat list: from, to, from, to, ...
  loop do
    from_line, to_line = get_contect_block(block_distribution, to_line)
    # Check before slicing: a negative from_line would index `lines` from the
    # end and could append the last line to the content by mistake.
    break if from_line < 0

    content.concat(lines[from_line..to_line] || [])
    chart_points.push(from_line, to_line)
  end
  if chart_distribution
    if chart_points.empty?
      warn 'there is no content for the web page, cannot chart'
    else
      chart(block_distribution, chart_points)
    end
  end
  content.join("\n")
end

.get_content_by_tag(html, block_content) ⇒ `Object`

# File 'lib/cx_extractor.rb', line 54

# Re-extracts the content from the element that contains it in the HTML tree.
#
# Every <p> element whose text occurs in block_content contributes its parent
# node; the parent that occurs most often is rendered with `to_s`, cleaned by
# get_clean_text, and returned as stripped lines joined with "\n".
# When no <p> element matches, block_content is returned unchanged instead of
# being replaced by an empty result.
#
# @param html [String] raw HTML of the page
# @param block_content [String] text already extracted by get_content
# @return [String] text of the most frequent parent, with runs of spaces and
#   newlines collapsed, or block_content when nothing matches
def get_content_by_tag(html, block_content)
  paragraphs = Nokogiri::HTML(html).css('p')
  parents = paragraphs.select { |p_dom| block_content.include?(p_dom.text) }
                      .map(&:parent)
  return block_content if parents.empty?

  max_p = parents.max_by { |node| parents.count(node) }
  cleaned = get_clean_text(max_p.to_s).split("\n").map(&:strip).join("\n")
  # A bare `squeeze` collapses every doubled character ("book" -> "bok");
  # restrict it to whitespace so only blank runs are compacted.
  cleaned.squeeze(" \n")
end

.get_title(html) ⇒ `Object`

# File 'lib/cx_extractor.rb', line 25

# Returns the page title found by TITLE_REGEXP in the given HTML.
#
# @param html [String] raw HTML of the page
# @return [String, nil] capture group 1 of the first match, or nil when
#   the pattern does not match
def get_title(html)
  found = TITLE_REGEXP.match(html)
  found ? found[1] : nil
end

Module: CxExtractor

Overview

Defined Under Namespace

Constant Summary collapse

Class Attribute Summary collapse

Class Method Summary collapse

Methods included from Chart

Methods included from Utils

Class Attribute Details

.options ⇒ Object

Class Method Details

.article(html) ⇒ Object

.configure {|_self| ... } ⇒ Object

.get_contect_block(block_distribution, to_line) ⇒ Object

.get_content(lines, block_distribution) ⇒ Object

.get_content_by_tag(html, block_content) ⇒ Object

.get_title(html) ⇒ Object

.options ⇒ `Object`

.article(html) ⇒ `Object`

.configure {|_self| ... } ⇒ `Object`

.get_contect_block(block_distribution, to_line) ⇒ `Object`

.get_content(lines, block_distribution) ⇒ `Object`

.get_content_by_tag(html, block_content) ⇒ `Object`

.get_title(html) ⇒ `Object`