Class: Datahen::Scraper::RubyParserExecutor
- Inherits:
-
Executor
- Object
- Executor
- Datahen::Scraper::RubyParserExecutor
show all
- Defined in:
- lib/datahen/scraper/ruby_parser_executor.rb
Constant Summary
collapse
- FIND_OUTPUTS_RETRY_LIMIT =
nil
Constants inherited
from Executor
Executor::MAX_FIND_OUTPUTS_PER_PAGE
Instance Attribute Summary collapse
Attributes inherited from Executor
#filename, #gid, #job_id, #page
Class Method Summary
collapse
Instance Method Summary
collapse
Methods inherited from Executor
#clean_backtrace, #eval_with_context, #find_output, #find_outputs, #finish, #finisher_update, #get_content, #get_failed_content, #get_job_id, #init_global_page, #init_job_page, #init_page, #parsing_update, #remove_old_dups!, #remove_old_output_dups!, #remove_old_page_dups!, #save_outputs, #save_pages, #save_pages_and_outputs, #seeding_update
#create_context, #expose_to, #exposed_env, #exposed_methods, #isolated_binding, #var_or_proc
Constructor Details
Returns a new instance of RubyParserExecutor.
19
20
21
22
23
24
25
26
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 19
# Builds a parser executor from an options hash.
#
# @param options [Hash] construction options
# @option options [Object] :filename parser script file (required)
# @option options [Hash, nil] :page page to parse; defaults to nil
# @option options [Object] :gid page GID, only consulted when the page has
#   no 'gid' entry
# @option options [Object] :job_id job identifier (required)
# @option options [Hash] :vars page vars; defaults to an empty hash
# @option options [Object] :keep_outputs coerced to true/false; defaults to false
# @raise [RuntimeError] when :filename is missing, or when neither the page
#   nor the options supply a GID
# @raise [KeyError] when :job_id is missing
def initialize(options = {})
  @filename = options.fetch(:filename) { raise "Filename is required" }
  @page = options.fetch(:page, nil)

  # A GID carried by the page wins over an explicit :gid option.
  gid_from_page = (self.page || {})['gid']
  @gid = gid_from_page || options.fetch(:gid) { raise "GID or a page with a GID is required" }

  @job_id = options.fetch(:job_id)
  @page_vars = options.fetch(:vars, {})
  @keep_outputs = options.fetch(:keep_outputs, false) ? true : false
end
|
Instance Attribute Details
#limbo_self ⇒ Boolean
15
16
17
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 15
# Reads the "limbo self" flag stored in @limbo_self.
# In this class the flag is set to true by #limbo when it receives this
# page's own GID, and #eval_parser_script consults it after the parser
# script has run to decide whether to call #limbo_page.
# @return [Boolean] current value of the flag
def limbo_self
@limbo_self
end
|
#refetch_self ⇒ Boolean
Note:
It takes precedence over the #reparse_self flag.
Refetch self page flag.
8
9
10
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 8
# Reads the "refetch self" flag stored in @refetch_self.
# In this class the flag is set to true by #refetch when it receives this
# page's own GID. #eval_parser_script checks it before the reparse and
# limbo flags, and skips saving pages/outputs while it is set.
# @return [Boolean] current value of the flag
def refetch_self
@refetch_self
end
|
#reparse_self ⇒ Boolean
Note:
It takes precedence over the #limbo_self flag.
Reparse self page flag.
12
13
14
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 12
# Reads the "reparse self" flag stored in @reparse_self.
# In this class the flag is set to true by #reparse when it receives this
# page's own GID. #eval_parser_script checks it after the refetch flag and
# before the limbo flag.
# @return [Boolean] current value of the flag
def reparse_self
@reparse_self
end
|
#save ⇒ Object
Returns the value of attribute save.
4
5
6
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 4
# Reads the save flag stored in @save.
# #exec_parser assigns it from its argument; the *_page and update_*_status
# methods of this class only contact the server when it is truthy and
# otherwise just report what they would have done (or return early).
# @return [Object] the value last assigned to @save
def save
@save
end
|
Class Method Details
.exposed_methods ⇒ Object
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 28
# Names of the executor methods this class lists as exposed.
# NOTE(review): presumably consumed by the inherited exposure helpers
# (#expose_to / #exposed_env) when building the script context — confirm.
#
# @return [Array<Symbol>] frozen list of method names
def self.exposed_methods
  %i[
    get_content
    get_failed_content
    content
    failed_content
    outputs
    pages
    page
    save_pages
    save_outputs
    find_output
    find_outputs
    refetch
    reparse
    limbo
    finish
  ].freeze
end
|
Instance Method Details
#content ⇒ Object
219
220
221
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 219
# Content of the current page, looked up with #get_content for this
# executor's job id and GID. A truthy result is cached in @content, so the
# lookup is repeated only while the cached value is nil/false.
#
# @return [Object] whatever #get_content returned
def content
  return @content if @content

  @content = get_content(job_id, gid)
end
|
#eval_parser_script(save = false) ⇒ Object
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 169
# Runs the parser script for the current page and applies its results.
#
# Steps, in order:
# 1. report the "starting" status (#update_parsing_starting_status);
# 2. build the page (#init_page, then #init_page_vars) plus empty
#    outputs/pages collections and clear the refetch/reparse/limbo flags;
# 3. evaluate the script file inside an isolated binding exposing
#    :outputs, :pages and :page;
# 4. save the collected pages and outputs unless a self-refetch was asked;
# 5. refetch, reparse or limbo this page if the matching flag is set
#    (checked in that order), otherwise report the "done" status.
#
# @param save [Boolean] when truthy, failures are reported through
#   #handle_error before being re-raised
# @return [Object] result of the final refetch/reparse/limbo/done call
# @raise [SyntaxError, StandardError] any error raised by the script or by
#   saving, re-raised unchanged
def eval_parser_script(save = false)
  update_parsing_starting_status

  page = init_page_vars(init_page)
  outputs = []
  pages = []

  self.refetch_self = false
  self.reparse_self = false
  self.limbo_self = false

  begin
    exposed = { outputs: outputs, pages: pages, page: page }
    eval_with_context filename, isolated_binding(exposed)
  rescue Error::SafeTerminateError
    # Raised on purpose to stop the script early (see #refetch, #reparse,
    # #limbo); it is not a failure, so execution simply continues below.
  rescue SyntaxError, StandardError => e
    handle_error(e) if save
    raise e
  end

  puts "=========== Parsing Executed ==========="

  # A self-refetch discards whatever the script collected.
  unless refetch_self
    begin
      save_pages_and_outputs(pages, outputs, :parsing)
    rescue => e
      handle_error(e) if save
      raise e
    end
  end

  return refetch_page(gid) if refetch_self
  return reparse_page(gid) if reparse_self
  return limbo_page(gid) if limbo_self

  update_parsing_done_status
end
|
#exec_parser(save = false) ⇒ Object
48
49
50
51
52
53
54
55
56
57
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 48
# Entry point for running the parser script.
# Stores the save flag, announces whether this is a real run or a trial
# run, and hands over to #eval_parser_script.
#
# @param save [Boolean] truthy for a real run, falsy for a trial run
# @return [Object] whatever #eval_parser_script returns
def exec_parser(save = false)
  @save = save
  puts(save ? "Executing parser script" : "Trying parser script")
  eval_parser_script(save)
end
|
#failed_content ⇒ Object
223
224
225
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 223
# Failed content of the current page, looked up with #get_failed_content
# for this executor's job id and GID. A truthy result is cached in
# @failed_content, so the lookup is repeated only while the cached value is
# nil/false.
#
# @return [Object] whatever #get_failed_content returned
def failed_content
  return @failed_content if @failed_content

  @failed_content = get_failed_content(job_id, gid)
end
|
#handle_error(e) ⇒ Object
227
228
229
230
231
232
233
234
235
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 227
# Reports a parsing failure through #parsing_update.
# The log text is a headline (exception class, message, job id and GID)
# followed by the cleaned backtrace, joined with newlines.
#
# @param e [Exception] the error to report
# @return [Object] whatever #parsing_update returns
def handle_error(e)
  headline = "Parsing #{e.class}: #{e.to_s} (Job:#{job_id} GID:#{gid})"
  trace = clean_backtrace(e.backtrace)
  report = [headline, trace].join("\n")

  parsing_update(
    job_id: job_id,
    gid: gid,
    parsing_status: :failed,
    log_error: report
  )
end
|
#init_page_vars(page) ⇒ Object
59
60
61
62
63
64
65
66
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 59
# Picks the page object the parser script will see.
# When the executor already holds a page (self.page) that one is returned
# untouched. Otherwise the given page is returned, with its 'vars' entry
# replaced by @page_vars when those are present and non-empty.
#
# @param page [Hash] freshly initialised page; may be mutated
# @return [Hash] the executor's own page, or the given page
def init_page_vars(page)
  own_page = self.page
  return own_page unless own_page.nil?

  vars = @page_vars
  page['vars'] = vars unless vars.nil? || vars.empty?
  page
end
|
#limbo(page_gid) ⇒ Object
160
161
162
163
164
165
166
167
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 160
# Sends a page to limbo.
# For any other page this delegates to #limbo_page. For the page currently
# being parsed it only raises the limbo flag and aborts the script with
# Error::SafeTerminateError.
#
# @param page_gid [String] GID of the page to limbo
# @return [Object] result of #limbo_page (other pages only)
# @raise [ArgumentError] when page_gid is not a String
# @raise [Error::SafeTerminateError] when page_gid is this page's GID
def limbo(page_gid)
  unless page_gid.is_a?(String)
    raise ArgumentError, "page_gid needs to be a String."
  end
  return limbo_page(page_gid) unless page_gid == gid

  self.limbo_self = true
  raise Error::SafeTerminateError
end
|
#limbo_page(gid) ⇒ Object
151
152
153
154
155
156
157
158
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 151
# Limbos the given page on the server, or only announces it on a trial run.
# The server call happens only when #save is truthy.
#
# @param gid [Object] GID of the page to limbo
# @return [nil]
def limbo_page(gid)
  unless save
    puts "Would have limbo page #{gid}"
    return
  end

  Client::JobPage.new({gid: gid}).limbo(self.job_id)
  puts "Limbo page #{gid}"
end
|
#refetch(page_gid) ⇒ Object
124
125
126
127
128
129
130
131
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 124
# Requests a refetch of a page.
# For any other page this delegates to #refetch_page. For the page
# currently being parsed it only raises the refetch flag and aborts the
# script with Error::SafeTerminateError.
#
# @param page_gid [String] GID of the page to refetch
# @return [Object] result of #refetch_page (other pages only)
# @raise [ArgumentError] when page_gid is not a String
# @raise [Error::SafeTerminateError] when page_gid is this page's GID
def refetch(page_gid)
  unless page_gid.is_a?(String)
    raise ArgumentError, "page_gid needs to be a String."
  end
  return refetch_page(page_gid) unless page_gid == gid

  self.refetch_self = true
  raise Error::SafeTerminateError
end
|
#refetch_page(gid) ⇒ Object
115
116
117
118
119
120
121
122
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 115
# Refetches the given page on the server, or only announces it on a trial
# run. The server call happens only when #save is truthy.
#
# @param gid [Object] GID of the page to refetch
# @return [nil]
def refetch_page(gid)
  unless save
    puts "Would have refetch page #{gid}"
    return
  end

  Client::JobPage.new({gid: gid}).refetch(self.job_id)
  puts "Refetch page #{gid}"
end
|
#reparse(page_gid) ⇒ Object
142
143
144
145
146
147
148
149
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 142
# Requests a reparse of a page.
# For any other page this delegates to #reparse_page. For the page
# currently being parsed it only raises the reparse flag and aborts the
# script with Error::SafeTerminateError.
#
# @param page_gid [String] GID of the page to reparse
# @return [Object] result of #reparse_page (other pages only)
# @raise [ArgumentError] when page_gid is not a String
# @raise [Error::SafeTerminateError] when page_gid is this page's GID
def reparse(page_gid)
  unless page_gid.is_a?(String)
    raise ArgumentError, "page_gid needs to be a String."
  end
  return reparse_page(page_gid) unless page_gid == gid

  self.reparse_self = true
  raise Error::SafeTerminateError
end
|
#reparse_page(gid) ⇒ Object
133
134
135
136
137
138
139
140
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 133
# Reparses the given page on the server, or only announces it on a trial
# run. The server call happens only when #save is truthy.
#
# @param gid [Object] GID of the page to reparse
# @return [nil]
def reparse_page(gid)
  unless save
    puts "Would have reparse page #{gid}"
    return
  end

  Client::JobPage.new({gid: gid}).reparse(self.job_id)
  puts "Reparse page #{gid}"
end
|
#save_type ⇒ Object
111
112
113
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 111
# Identifies the kind of save this executor performs.
# NOTE(review): presumably read by the inherited save helpers to label the
# update — confirm against Executor.
# @return [Symbol] always :parsing
def save_type
:parsing
end
|
#update_parsing_done_status ⇒ Object
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 95
# Tells the server that parsing of the current page is done.
# Does nothing on a trial run (#save falsy).
#
# @return [nil]
# @raise [RuntimeError] when the server answers with a code other than 200
def update_parsing_done_status
  return unless save

  response = parsing_update(job_id: job_id, gid: gid, parsing_status: :done)
  unless response.code == 200
    failure = "Unable to save Page Parsing Done Status to server: #{response.body}"
    puts "Error: #{failure}"
    raise failure
  end

  puts "Page Parsing Done."
end
|
#update_parsing_starting_status ⇒ Object
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 77
# Tells the server that parsing of the current page is starting, passing
# along the keep_outputs setting. Does nothing on a trial run (#save falsy).
#
# @return [nil]
# @raise [RuntimeError] when the server answers with a code other than 200
def update_parsing_starting_status
  return unless save

  response = parsing_update(
    job_id: job_id,
    gid: gid,
    parsing_status: :starting,
    keep_outputs: @keep_outputs
  )
  unless response.code == 200
    failure = "Unable to save Page Parsing Status to server: #{response.body}"
    puts "Error: #{failure}"
    raise failure
  end

  puts "Page Parsing Status Updated."
end
|
#update_to_server(opts = {}) ⇒ Object
68
69
70
71
72
73
74
75
|
# File 'lib/datahen/scraper/ruby_parser_executor.rb', line 68
# Forwards a parsing update to #parsing_update.
# Only the :job_id, :gid, :pages, :outputs and :status entries of opts are
# used; :status is sent under the :parsing_status key. Missing entries are
# sent as nil.
#
# @param opts [Hash] update fields
# @return [Object] whatever #parsing_update returns
def update_to_server(opts = {})
  target_job, target_gid, new_pages, new_outputs, status =
    opts.values_at(:job_id, :gid, :pages, :outputs, :status)

  parsing_update(
    job_id: target_job,
    gid: target_gid,
    pages: new_pages,
    outputs: new_outputs,
    parsing_status: status
  )
end
|