Module: Rsssf::Filters

Included in:
PageFetcher, Repo
Defined in:
lib/rsssf/html2txt.rb

Instance Method Summary collapse

Instance Method Details

#html_to_txt(html) ⇒ Object



6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# File 'lib/rsssf/html2txt.rb', line 6

def html_to_txt( html )

###
#   todo: check if any tags (still) present??


  ## cut off everything before body
  html = html.sub( /.+?<BODY>\s*/im, '' )

  ## cut off everything after body (closing)
  html = html.sub( /<\/BODY>.*/im, '' )


  ## remove cite
  html = html.gsub( /<CITE>([^<]+)<\/CITE>/im ) do |_|
    puts " remove cite >#{$1}<"
    "#{$1}"
  end

  html = html.gsub( /\s*<HR>\s*/im ) do |match|
    match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
    puts " replace horizontal rule (hr) - >#{match}<"
    "\n=-=-=-=-=-=-=-=-=-=-=-=-=-=-=\n"    ## check what hr to use use  - . - . - or =-=-=-= or somehting distinct?
  end

  ## replace break (br)
  ## note: do NOT use m/multiline for now - why? why not??
  html = html.gsub( /<BR>\s*/i ) do |match|    ## note: include (swallow) "extra" newline
    match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
    puts " replace break (br) - >#{match}<"
    "\n"
  end

  ## remove anchors (a name)
  html = html.gsub( /<A NAME[^>]*>(.+?)<\/A>/im ) do |match|   ## note: use .+? non-greedy match
    title = $1.to_s   ## note: "save" caputure first; gets replaced by gsub (next regex call)
    match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
    puts " replace anchor (a) name >#{title}< - >#{match}<"
    "#{title}"
  end

  ## remove anchors (a href)
  #    note: heading 4 includes anchor (thus, let anchors go first)
  #  note: <a \newline href is used for authors email - thus incl. support for newline as space
  html = html.gsub( /<A\s+HREF[^>]*>(.+?)<\/A>/im ) do |_|   ## note: use .+? non-greedy match
    puts " replace anchor (a) href >#{$1}<"
    "#{$1}"
  end

  ## replace paragrah (p)
  html = html.gsub( /\s*<P>\s*/im ) do |match|    ## note: include (swallow) "extra" newline
    match = match.gsub( "\n", '$$' )  ## make newlines visible for debugging
    puts " replace paragraph (p) - >#{match}<"
    "\n\n"
  end
  html = html.gsub( /<\/P>/i, '' )  ## replace paragraph (p) closing w/ nothing for now

  ## remove i
  html = html.gsub( /<I>([^<]+)<\/I>/im ) do |_|
    puts " remove italic (i) >#{$1}<"
    "#{$1}"
  end


  ## heading 2
  html = html.gsub( /\s*<H2>([^<]+)<\/H2>\s*/im ) do |_|
    puts " replace heading 2 (h2) >#{$1}<"
    "\n\n## #{$1}\n\n"    ## note: make sure to always add two newlines
  end

  ## heading 4
  html = html.gsub( /\s*<H4>([^<]+)<\/H4>\s*/im ) do |_|
    puts " replace heading 4 (h4) >#{$1}<"
    "\n\n#### #{$1}\n\n"    ## note: make sure to always add two newlines
  end


  ## remove b   - note: might include anchors (thus, call after anchors)
  html = html.gsub( /<B>([^<]+)<\/B>/im ) do |_|
    puts " remove bold (b) >#{$1}<"
    "**#{$1}**"
  end

  ## replace preformatted (pre)
  html = html.gsub( /<PRE>|<\/PRE>/i ) do |_|
    puts " replace preformatted (pre)"
    ''  # replace w/ nothing for now (keep surrounding newlines)
  end

=begin
  puts
  puts
  puts "html:"
  puts html[0..2000]
  puts "-- snip --"
  puts html[-1000..-1]   ## print last hundred chars
=end


  ## cleanup whitespaces
  ##   todo/fix:  convert newline in space first
  ##                and than collapse spaces etc.!!!
  txt = ''
  html.each_line do |line|
     line = line.gsub( "\t", '  ' ) # replace all tabs w/ two spaces for nwo
     line = line.rstrip             # remove trailing whitespace (incl. newline/formfeed)

     txt << line
     txt << "\n"
  end

  ### remove emails etc.
  txt = sanitize( txt )

  txt
end

#sanitize(txt) ⇒ Object



125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/rsssf/html2txt.rb', line 125

def sanitize( txt )
  ### remove emails for (spam/privacy) protection
  ## e.g. ([email protected])
  ##      ([email protected])
  ##      ([email protected])
  ##      ([email protected])

  ##   note add support for optional ‹› enclosure (used by html2txt converted a href :mailto links)
  ##   e.g. (‹[email protected]›)

  email_pattern = "\\(‹?[a-z][a-z0-9_]+@[a-z]+(\\.[a-z]+)+›?\\)"   ## note: just a string; needs to escape \\ twice!!!

  ## check for "free-standing e.g. on its own line" emails only for now
  txt = txt.gsub( /\n#{email_pattern}\n/i ) do |match|
    puts "removing (free-standing) email >#{match}<"
    "\n"   # return empty line
  end

  txt = txt.gsub( /#{email_pattern}/i ) do |match|
    puts "remove email >#{match}<"
    ''
  end

  txt  
end