Class: Pidgin2Adium::HtmlLogParser

Inherits:
BasicParser show all
Defined in:
lib/pidgin2adium/parsers/html_log_parser.rb

Constant Summary

Constants inherited from BasicParser

BasicParser::MINIMAL_TIME_REGEX, BasicParser::TIME_REGEX, BasicParser::TIME_REGEX_FIRST_LINE

Constants included from Pidgin2Adium

ADIUM_LOG_DIR, BAD_DIRS, FILE_EXISTS, VERSION

Instance Method Summary collapse

Methods inherited from BasicParser

#create_adium_time, #create_msg, #create_status_or_event_msg, #force_conversion?, #get_sender_by_alias, #is_minimal_time?, #parse, #pre_parse!, #printed_conversion_error!, #printed_conversion_error?, #strptime, #try_to_parse_minimal_time, #try_to_parse_time, #try_to_parse_time_with_formats

Methods included from Pidgin2Adium

balance_tags_c, delete_search_indexes, error, log_msg, oops, parse, parse_and_generate

Constructor Details

#initialize(src_path, user_aliases, force_conversion = false) ⇒ HtmlLogParser

Returns a new instance of HtmlLogParser.



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
# File 'lib/pidgin2adium/parsers/html_log_parser.rb', line 8

# Builds the regexes used to recognise lines of an HTML log.
#
# The three arguments are handed unchanged to the superclass constructor.
def initialize(src_path, user_aliases, force_conversion = false)
  super(src_path, user_aliases, force_conversion)

  # A parenthesised time, optionally preceded by a date and optionally
  # followed by AM/PM, e.g. "(2008-11-17 14:12:21)" or "(2:12:21 PM)".
  # Kept as a String so it can be interpolated into other regexes.
  stamp = '\(((?:\d{4}-\d{2}-\d{2} )?\d{1,2}:\d{1,2}:\d{1,2}(?: [AP]M)?)\)'
  @timestamp_rx = stamp

  # Matches a message line. Capture groups:
  #   1: timestamp (with or without the date)
  #   2: text inside <b> before the optional auto-reply marker and colon
  #      (screen name or alias)
  #   3: "&lt;AUTO-REPLY&gt;" or nil
  #   4: message body
  # The ":" is optional so that lines such as
  # "(17:12:21) <b>***Gabe B-W</b> is confused<br/>" also match.
  @line_regex = /#{stamp} ?<b>(.+?) ?(&lt;AUTO-REPLY&gt;)?:?<\/b> ?(.+)<br ?\/>/o

  # Matches a status line. Capture groups:
  #   1: timestamp
  #   2: status message
  @line_regex_status = /#{stamp} ?<b> (.+)<\/b><br ?\/>/o
end

Instance Method Details

#cleanup(text) ⇒ Object

Returns a cleaned string. Removes the following tags from text:

  • html

  • body

  • font

  • a with no inner text, e.g. <a href="blah"></a>

And removes the following style declarations:

  • color: #000000 (just turns text black)

  • font-family

  • font-size

  • background

  • em (not a style declaration: the <em> tag is changed to <span style="font-style: italic;">)

Since each <span> has only one style declaration, spans with these declarations are removed (but the text inside them is preserved).



42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
# File 'lib/pidgin2adium/parsers/html_log_parser.rb', line 42

# Cleans up the raw HTML of a log, modifying +text+ in place.
#
# Removes <html>, <body> and <font> tags, carriage returns, blank lines and
# <a> elements that contain only whitespace; turns every newline that is not
# followed by a timestamp into "<br/>"; converts single-quoted attributes to
# double-quoted ones and any remaining single quote to "&apos;"; and rewrites
# <em> as <span style="font-style: italic;">.
#
# Inside <span style="..."> elements the declarations "color: #000000",
# font-family, font-size and background are dropped. Other declarations
# (e.g. font-weight) are kept. A span that ends up with no declarations is
# replaced by its inner text, and a span with no inner text is removed.
#
# @param text [String] log contents; must not be frozen, it is edited in place
# @return [String] the same String object, cleaned
def cleanup(text)
  # Sometimes this is in there. I don't know why.
  text.gsub!(%r{&lt;/FONT HSPACE='\d'>}, '')
  # We can remove <font> safely since Pidgin and Adium both show bold
  # using <span style="font-weight: bold;"> except Pidgin uses single
  # quotes while Adium uses double quotes.
  text.gsub!(/<\/?(?:html|body|font)(?: .+?)?>/, '') # very important!

  text.delete!("\r")
  # Remove empty lines. A run of any length is collapsed to one newline;
  # replacing only "\n\n" pairs would leave a blank line behind for runs of
  # three or more, which then became a stray <br/> before a timestamp.
  text.gsub!(/\n{2,}/, "\n")

  # Remove the newline that ends the file, since it screws up the
  # newline -> <br/> conversion.
  text.gsub!(/\n\Z/, '')

  # Replace newlines with "<br/>" unless they end a chat line (i.e. the
  # next line starts with a timestamp).
  # This must go after we remove <font> tags.
  text.gsub!(/\n(?!#{@timestamp_rx})/, '<br/>')

  # These empty links are sometimes appended to every line in a chat,
  # for some weird reason. Remove them.
  text.gsub!(%r{<a href=['"].+?['"]>\s*?</a>}, '')

  # Replace single quotes inside tags with double quotes so we can
  # easily change the remaining single quotes to entities.
  # For spans, removes a space after the final declaration if it exists.
  # The second substitution covers href=, src=, alt= and any other
  # single-quoted attribute.
  text.gsub!(/<span style='([^']+?;) ?'>/, '<span style="\1">')
  text.gsub!(/([a-z]+=)'(.+?)'/, '\1"\2"')
  text.gsub!("'", '&apos;')

  text.gsub!(%r{<span style="(.+?)">(.*?)</span>}) do
    # Capture both groups right away: the matching done further down
    # overwrites the regexp globals.
    style = Regexp.last_match(1)
    innertext = Regexp.last_match(2)

    # Remove empty spans.
    next '' if innertext.empty?

    declarations = style.split(/; ?/).map do |decl|
      if decl[0, 5] == 'color'
        # Black is the default text colour, so it carries no information.
        next if decl.include?('color: #000000')

        # Sometimes this happens:
        #   <span style="color: #ff0000>today;">today was busy</span>
        # giving decl = "color: #ff0000>today" (or one ending in ">").
        # Keep just the colour and drop the ">" and whatever follows it.
        # Colours that are not 6-digit hex yield nil and are dropped.
        decl[/color: #[0-9a-fA-F]{6}/]
      else
        case decl
        when /^font-family/, /^font-size/, /^background/ then nil
        else decl # keep everything else, e.g. font-weight
        end
      end
    end
    declarations.compact!

    if declarations.empty?
      innertext
    else
      "<span style=\"#{declarations.join('; ')};\">#{innertext}</span>"
    end
  end

  # Pidgin uses <em>, Adium uses <span>
  text.gsub!('</em>', '</span>') if text.gsub!('<em>', '<span style="font-style: italic;">')

  text
end