18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
|
# File 'lib/pluto/feedfetcher/cond_get_with_cache.rb', line 18
# Fetches a feed with a conditional HTTP GET and records the outcome on the
# feed record.
#
# The etag / last-modified values stored on +feed_rec+ are placed into the
# worker's cache (keyed by the feed url) before the request is issued, and
# the worker's +use_cache+ flag is switched on only for the duration of the
# request.
#
# Outcomes:
# * network error (SocketError / SystemCallError) - logged, Activity
#   created, returns nil
# * HTTP 304 - logged, returns nil (record is not touched)
# * any other non-200 status - logged, record updated with the status and
#   cleared etag / last-modified / body / md5, Activity created, returns nil
# * HTTP 200 whose md5 equals the md5 stored on the record - returns nil
# * HTTP 200 with new content - record updated; returns the body as UTF-8
#
# @param feed_rec [Object] feed record; must respond to +feed_url+, +key+,
#   +http_etag+, +http_last_modified+, +md5+ and +update!+
# @return [String, nil] the UTF-8 feed text when new content was fetched and
#   stored; nil in every other case
def fetch( feed_rec )
  feed_url = feed_rec.feed_url
  feed_key = feed_rec.key

  # Hand the validators stored on the record to the worker so it can issue
  # a conditional request.
  @worker.use_cache = true
  @worker.cache[ feed_url ] = {
    'etag'          => feed_rec.http_etag,
    'last-modified' => feed_rec.http_last_modified
  }

  begin
    response = @worker.get( feed_url )
  rescue SocketError, SystemCallError => e
    msg = "*** error: fetching feed '#{feed_key}' - [#{e.class.name}] #{e}"
    logger.error msg
    Activity.create!( text: msg )
    return nil
  ensure
    # Always switch caching back off - also when an unexpected error
    # escapes - so the flag never leaks into later requests.
    @worker.use_cache = false
  end

  if response.code == '304'
    logger.info "OK - fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
    logger.info "no change; request returns not modified (304); skipping parsing feed"
    return nil
  end

  feed_fetched = Time.now

  # NOTE(review): headers are read with response[ name ]; this assumes the
  # worker returns a Net::HTTPResponse-like object - confirm against the
  # worker implementation.
  if response.code != '200'
    msg = "*** error: fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"
    logger.error msg

    feed_attribs = {
      http_code:          response.code.to_i,
      http_server:        response[ 'server' ],
      http_etag:          nil,
      http_last_modified: nil,
      body:               nil,
      md5:                nil,
      fetched:            feed_fetched
    }
    feed_rec.update!( feed_attribs )
    Activity.create!( text: msg )
    return nil
  end

  logger.info "OK - fetching feed '#{feed_key}' - HTTP status #{response.code} #{response.message}"

  feed_xml = response.body
  logger.debug "feed_xml.encoding.name (before): #{feed_xml.encoding.name}"

  # Normalise the body to UTF-8: relabel the bytes as UTF-8 first; when they
  # are not valid UTF-8, treat them as latin1 and transcode.
  begin
    utf8_xml = feed_xml.dup.force_encoding( Encoding::UTF_8 )
    unless utf8_xml.valid_encoding?
      msg = "*** warn: feed '#{feed_key}' charset encoding not valid utf8 - trying latin1"
      logger.warn msg
      Activity.create!( text: msg )
      utf8_xml = feed_xml.dup.force_encoding( Encoding::ISO_8859_1 ).encode( Encoding::UTF_8 )
    end
    feed_xml = utf8_xml
  rescue EncodingError => e
    msg = "*** warn: feed '#{feed_key}' charset encoding to utf8 failed; throwing out invalid bits - #{e}"
    logger.warn msg
    Activity.create!( text: msg )
    # Non-destructive encode: leaves the response's own body string alone.
    feed_xml = feed_xml.encode( Encoding::UTF_8, invalid: :replace, undef: :replace )
  end
  logger.debug "feed_xml.encoding.name (after): #{feed_xml.encoding.name}"

  # Second change check: compare the content digest with the stored one.
  last_feed_md5 = feed_rec.md5
  feed_md5      = Digest::MD5.hexdigest( feed_xml )
  if last_feed_md5 && last_feed_md5 == feed_md5
    logger.info "no change; md5 digests match; skipping parsing feed"
    return nil
  end

  http_server        = response[ 'server' ]
  http_etag          = response[ 'etag' ]
  http_last_modified = response[ 'last-modified' ]

  feed_attribs = {
    http_code:          response.code.to_i,
    http_server:        http_server,
    http_etag:          http_etag,
    http_last_modified: http_last_modified,
    body:               feed_xml,
    md5:                feed_md5,
    fetched:            feed_fetched
  }

  logger.debug "http header - server: #{http_server} - #{http_server.class.name}"
  logger.debug "http header - etag: #{http_etag} - #{http_etag.class.name}"
  logger.debug "http header - last-modified: #{http_last_modified} - #{http_last_modified.class.name}"

  begin
    feed_rec.update!( feed_attribs )
  rescue StandardError => e
    # StandardError (not Exception) so interrupts and exit requests still
    # propagate.
    msg = "*** error: updating feed database record '#{feed_key}' - #{e}"
    logger.error msg
    Activity.create!( text: msg )
    return nil
  end

  logger.debug "feed_xml:"
  logger.debug feed_xml[ 0..300 ]
  logger.info "Before parsing feed >#{feed_key}<..."

  feed_xml
end
|