Module: CxExtractor

Extended by:
Chart, Utils
Defined in:
lib/cx_extractor.rb,
lib/cx_extractor/chart.rb,
lib/cx_extractor/utils.rb,
lib/cx_extractor/config.rb,
lib/cx_extractor/version.rb

Overview

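Extracts article content (`.article`) and the page title (`.get_title`) from an HTML string. Settings live in `.options` and can be adjusted through `.configure`.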

Defined Under Namespace

Modules: Chart, Utils

Constant Summary collapse

# Captures the text between a literal `<title>` and `</title>` pair.
# The pattern is case-sensitive and `.` does not cross newlines, so only a
# lowercase, attribute-less title on a single line is matched.
TITLE_REGEXP = %r{<title>(.*?)</title>}.freeze

# Default settings. `freeze` is shallow: the nested `chart_theme` hash itself
# stays mutable.
# NOTE(review): `balck_width` looks like a misspelling of `block_width`; it is
# left untouched because the code that reads this key is not visible here.
DEFAULTS = {
  threshold: 86,
  balck_width: 3,
  explore_parent: true,
  chart_distribution: false,
  chart_file_name: 'distribution.png',
  chart_theme: { marker_color: '#AEA9A9', font_color: 'black', background_colors: 'white' }
}.freeze

# Gem version string.
VERSION = '0.1.2'.freeze

Class Attribute Summary collapse

Class Method Summary collapse

Methods included from Chart

cal_color, cal_labels, chart, gruff_line

Methods included from Utils

find_dive, find_surge, get_clean_text, line_block_distribute, replace_special_char

Class Attribute Details

.optionsObject



17
18
19
# File 'lib/cx_extractor/config.rb', line 17

# Returns the memoized, mutable settings hash, seeded from DEFAULTS on first
# use.
#
# Nested hashes (such as `:chart_theme`) are copied as well: a plain
# `DEFAULTS.dup` is shallow, so editing `options[:chart_theme]` would
# otherwise write straight through into the shared DEFAULTS constant.
#
# @return [Hash] the current settings; the same object on every call
def options
  @options ||= DEFAULTS.transform_values { |value| value.is_a?(Hash) ? value.dup : value }
end

Class Method Details

.article(html) ⇒ Object



15
16
17
18
19
20
21
22
23
# File 'lib/cx_extractor.rb', line 15

# Extracts the content text from an HTML string.
#
# The cleaned text is split into stripped lines, handed to
# `line_block_distribute`, and the result is passed to `get_content`. When
# `explore_parent` is truthy that result is replaced by whatever
# `get_content_by_tag` returns for it.
#
# @param html [String] the raw HTML of the page
# @return [String] the extracted text with runs of spaces, tabs and newlines
#   collapsed to one character and surrounding whitespace removed
def article(html)
  lines = get_clean_text(html).split("\n").map(&:strip)
  block_distribution = line_block_distribute(lines)
  content = get_content(lines, block_distribution)
  content = get_content_by_tag(html, content) if explore_parent
  # Squeeze whitespace only: a bare `squeeze` collapses *every* doubled
  # character and would turn "book" into "bok".
  content.squeeze(" \t\n").strip
end

.configure {|_self| ... } ⇒ Object

Yields:

  • (_self)

Yield Parameters:

  • _self (CxExtractor)

    the object that the method was called on



23
24
25
# File 'lib/cx_extractor/config.rb', line 23

# Hands the receiver to the given block so settings can be changed in place.
#
# @yieldparam _self [CxExtractor] the object `configure` was called on
# @return [Object] whatever the block returns
def configure
  yield(self)
end

.get_contect_block(block_distribution, to_line) ⇒ Object



48
49
50
51
52
# File 'lib/cx_extractor.rb', line 48

# Finds the boundaries of the next block after `to_line`.
#
# The start comes from `find_surge` (called with the configured `threshold`)
# and the end from `find_dive`, which is searched from that start.
# NOTE(review): both helpers live in Utils and are not visible here; the
# caller treats a negative start as "no further block".
#
# @param block_distribution [Object] passed through to both helpers
# @param to_line [Integer] position the search for the next start begins at
# @return [Array] a two-element `[from_line, to_line]` pair
def get_contect_block(block_distribution, to_line)
  block_start = find_surge(block_distribution, to_line, threshold)
  [block_start, find_dive(block_distribution, block_start)]
end

.get_content(lines, block_distribution) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# File 'lib/cx_extractor.rb', line 30

# Collects the lines of every block reported by `get_contect_block` and joins
# them with newlines.
#
# Blocks are requested repeatedly, each search starting from the end of the
# previous block, until a negative start index is returned. When
# `chart_distribution` is truthy the collected boundaries are passed to
# `chart`; if nothing was collected a warning is emitted instead.
#
# @param lines [Array<String>] the page text, one entry per line
# @param block_distribution [Object] passed through to `get_contect_block`
#   and `chart`
# @return [String] the selected lines joined by "\n" ("" when none were found)
def get_content(lines, block_distribution)
  content = []
  chart_points = []
  to_line = 0
  loop do
    from_line, to_line = get_contect_block(block_distribution, to_line)
    # Stop *before* slicing: a negative start would make the range count from
    # the end of `lines` and append unrelated trailing lines.
    break if from_line < 0

    content.concat(lines[from_line..to_line])
    chart_points.push(from_line, to_line)
  end
  # Only complain about charting when charting was actually requested.
  if chart_distribution
    if chart_points.empty?
      warn 'there is no content for the web page, cannot chart'
    else
      chart(block_distribution, chart_points)
    end
  end
  content.join("\n")
end

.get_content_by_tag(html, block_content) ⇒ Object



54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/cx_extractor.rb', line 54

# Refines `block_content` using the page's `<p>` elements.
#
# Every `<p>` whose text occurs inside `block_content` votes for its parent
# node; the parent with the most votes is cleaned with `get_clean_text` and
# returned as stripped lines joined by "\n".
#
# @param html [String] the raw HTML of the page
# @param block_content [String] the text previously selected for the page
# @return [String] the cleaned text of the winning parent with whitespace
#   runs collapsed, or `block_content` unchanged when no paragraph matches
def get_content_by_tag(html, block_content)
  doc = Nokogiri::HTML(html)
  matching = doc.css('p').select do |p_dom|
    text = p_dom.text
    # Blank paragraphs are skipped: `include?("")` is true for any string, so
    # they would vote for whichever container holds the most empty <p> tags.
    !text.strip.empty? && block_content.include?(text)
  end
  parents = matching.map(&:parent)
  # Nothing matched: keep what we already have rather than returning "".
  return block_content if parents.empty?

  best_parent = parents.max_by { |node| parents.count(node) }
  cleaned = get_clean_text(best_parent.to_s).split("\n").map(&:strip).join("\n")
  # Whitespace only; a bare `squeeze` would also collapse doubled letters.
  cleaned.squeeze(" \t\n")
end

.get_title(html) ⇒ Object



25
26
27
28
# File 'lib/cx_extractor.rb', line 25

# Returns the text captured by TITLE_REGEXP in the given HTML.
#
# @param html [String] the raw HTML of the page
# @return [String, nil] the captured title text, or nil when the pattern
#   does not match
def get_title(html)
  found = TITLE_REGEXP.match(html)
  found ? found[1] : nil
end