Module: CxExtractor
- Extended by:
- Chart, Utils
- Defined in:
- lib/cx_extractor.rb,
lib/cx_extractor/chart.rb,
lib/cx_extractor/utils.rb,
lib/cx_extractor/config.rb,
lib/cx_extractor/version.rb
Overview
Defined Under Namespace
Modules: Chart, Utils
Constant Summary
collapse
- TITLE_REGEXP =
%r{<title>(.*?)</title>}.freeze
- DEFAULTS =
{
threshold: 86,
balck_width: 3,
explore_parent: true,
chart_distribution: false,
chart_file_name: 'distribution.png',
chart_theme: {
marker_color: '#AEA9A9',
font_color: 'black',
background_colors: 'white'
}
}.freeze
- VERSION =
'0.1.2'.freeze
Class Attribute Summary collapse
Class Method Summary
collapse
Methods included from Chart
cal_color, cal_labels, chart, gruff_line
Methods included from Utils
find_dive, find_surge, get_clean_text, line_block_distribute, replace_special_char
Class Attribute Details
.options ⇒ Object
17
18
19
|
# File 'lib/cx_extractor/config.rb', line 17
# Returns the mutable option hash, creating it from DEFAULTS on first use.
#
# DEFAULTS is only shallowly frozen and holds a nested hash, so a plain
# `dup` would share that nested hash with the copy and let callers edit the
# defaults through it. A Marshal round-trip yields a fully independent,
# unfrozen deep copy.
#
# @return [Hash] the memoized options
def options
  @options ||= Marshal.load(Marshal.dump(DEFAULTS))
end
|
Class Method Details
.article(html) ⇒ Object
15
16
17
18
19
20
21
22
23
|
# File 'lib/cx_extractor.rb', line 15
# Extracts the main text content from an HTML document.
#
# The HTML is passed through get_clean_text, split into stripped lines, and
# the line distribution from line_block_distribute is handed to get_content
# to pick the content lines. When explore_parent is truthy the result is
# further refined by get_content_by_tag.
#
# @param html [String] raw HTML of the page
# @return [String] extracted text with runs of spaces/newlines collapsed and
#   surrounding whitespace removed
def article(html)
  ctext = get_clean_text(html)
  lines = ctext.split("\n").map(&:strip)
  block_distribution = line_block_distribute(lines)
  content = get_content(lines, block_distribution)
  content = get_content_by_tag(html, content) if explore_parent
  # A bare `squeeze` collapses *every* repeated character ("hello" => "helo"),
  # corrupting the text; only whitespace runs should be collapsed.
  content.squeeze(" \n").strip
end
|
23
24
25
|
# File 'lib/cx_extractor/config.rb', line 23
# Hands the receiver to the given block so settings can be changed in place.
#
# @yieldparam config [Object] the receiver itself
# @return [Object] whatever the block returns
# @raise [LocalJumpError] when called without a block
def configure
  yield(self)
end
|
.get_contect_block(block_distribution, to_line) ⇒ Object
48
49
50
51
52
|
# File 'lib/cx_extractor.rb', line 48
# Locates the next block boundaries in the distribution.
#
# The start comes from find_surge (searching from to_line with the configured
# threshold); the end comes from find_dive, searching from that start.
#
# @param block_distribution [Array] per-line distribution values
# @param to_line [Integer] position the search starts from
# @return [Array] two-element array: [start index, end index]
def get_contect_block(block_distribution, to_line)
  block_start = find_surge(block_distribution, to_line, threshold)
  block_end = find_dive(block_distribution, block_start)
  [block_start, block_end]
end
|
.get_content(lines, block_distribution) ⇒ Object
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
# File 'lib/cx_extractor.rb', line 30
# Collects the lines of every content block reported by get_contect_block.
#
# Blocks are requested repeatedly, each search continuing from the previous
# block's end, until a negative start index signals that no block is left.
# When chart_distribution is truthy the collected boundaries are passed to
# chart, or a warning is emitted if no block was found.
#
# @param lines [Array<String>] the page text split into lines
# @param block_distribution [Array] per-line distribution values
# @return [String] the selected lines joined with newlines ("" if none)
def get_content(lines, block_distribution)
  to_line = 0
  content = []
  chart_points = []
  loop do
    from_line, to_line = get_contect_block(block_distribution, to_line)
    # Stop before touching `lines`: a negative start is a sentinel, not an
    # index, and slicing with it would append lines counted from the end.
    break if from_line < 0

    content.concat(lines[from_line..to_line])
    chart_points.push(from_line, to_line)
  end
  # Only complain about charting when charting was actually requested.
  if chart_distribution
    if chart_points.empty?
      warn 'there is no content for the web page, cannot chart'
    else
      chart(block_distribution, chart_points)
    end
  end
  content.join("\n")
end
|
.get_content_by_tag(html, block_content) ⇒ Object
54
55
56
57
58
59
60
61
62
63
64
65
|
# File 'lib/cx_extractor.rb', line 54
# Refines block-extracted content using the document's <p> elements.
#
# Every <p> whose text occurs in block_content votes for its parent node; the
# parent with the most votes is cleaned with get_clean_text and returned as
# stripped, newline-joined lines.
#
# @param html [String] raw HTML of the page
# @param block_content [String] content already selected by line blocks
# @return [String] text of the winning parent node, or block_content
#   unchanged when no paragraph matches
def get_content_by_tag(html, block_content)
  doc = Nokogiri::HTML(html)
  parents = []
  doc.css('p').each do |p_dom|
    text = p_dom.text
    # A blank paragraph would match any content (`include?("")` is always
    # true) and skew the vote, so it is ignored.
    next if text.strip.empty?

    parents << p_dom.parent if block_content.include?(text)
  end
  # Without a matching paragraph there is nothing to refine; keep the
  # block-based result instead of replacing it with an empty string.
  return block_content if parents.empty?

  max_p = parents.max_by { |node| parents.count(node) }
  cleaned = get_clean_text(max_p.to_s).split("\n").map(&:strip).join("\n")
  # Collapse whitespace runs only; a bare `squeeze` would also collapse
  # doubled letters inside words.
  cleaned.squeeze(" \n")
end
|
.get_title(html) ⇒ Object
25
26
27
28
|
# File 'lib/cx_extractor.rb', line 25
# Pulls the text of the first <title> element out of raw HTML.
#
# @param html [String, nil] raw HTML of the page
# @return [String, nil] the first capture of TITLE_REGEXP, or nil when the
#   pattern does not match
def get_title(html)
  found = TITLE_REGEXP.match(html)
  found ? found[1] : nil
end
|