Class: Websitary::Configuration

Inherits:
Object
  • Object
show all
Defined in:
lib/websitary/configuration.rb

Overview

This class defines the scope in which profiles are evaluated. Most of its methods are suitable for use in profiles.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(app, args = []) ⇒ Configuration

Returns a new instance of Configuration.



32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# File 'lib/websitary/configuration.rb', line 32

# Create the configuration scope: choose the configuration directory,
# install the built-in defaults, then run migrate, initialize_options,
# the 'config.rb' profile and finally the command-line parser.
#
# app::  The application object; stored in @app.
# args:: Array of command-line arguments (default: [])
def initialize(app, args=[])
    @logger = Websitary::AppLog.new
    $logger.debug "Configuration#initialize"
    @app    = app
    @cfgdir = ENV['HOME'] ? File.join(ENV['HOME'], '.websitary') : '.'

    # Directories that override the HOME-based default when they exist;
    # the first existing one wins. Candidates that cannot be built (unset
    # USERPROFILE, missing sysconfdir) are nil and must be dropped before
    # the existence test, because File.exist?(nil) raises a TypeError.
    # RbConfig replaces the Config constant that newer Rubies removed.
    sysconfdir = RbConfig::CONFIG['sysconfdir']
    candidates = [
        ENV['USERPROFILE'] && File.join(ENV['USERPROFILE'], 'websitary'),
        sysconfdir && File.join(sysconfdir, 'websitary')
    ].compact
    existing = candidates.find {|dir| File.exist?(dir)}
    @cfgdir  = existing if existing

    @cmd_edit          = 'vi "%s"'
    @execute           = 'downdiff'
    @quicklist_profile = 'quicklist'
    @user_agent        = "websitary/#{Websitary::VERSION}"
    @view              = 'w3m "%s"'

    @allow             = {}
    @default_options   = {}
    @default_profiles  = [@quicklist_profile]
    @done              = []
    @mtimes            = Websitary::FileMTimes.new(self)
    @options           = {}
    @outfile           = {}
    @profiles          = []
    @robots            = {}
    @todo              = []
    @exclude           = []
    @urlencmap         = {}
    @urls              = {}

    @suffix = {
        'text' => 'txt'
        # 'rss'  => 'xml'
    }

    migrate
    initialize_options
    profile 'config.rb'
    parse_command_line_args(args)

    # The output format is only defaulted if no profile/argument set it.
    @output_format   ||= ['html']
    @output_title      = %{#{Websitary::APPNAME}: #{@profiles.join(", ")}}
end

Instance Attribute Details

#cfgdir ⇒ Object

The user configuration directory



19
20
21
# File 'lib/websitary/configuration.rb', line 19

# Reader for @cfgdir, the user configuration directory.
def cfgdir; @cfgdir; end

#done ⇒ Object

Array of downloaded urls.



17
18
19
# File 'lib/websitary/configuration.rb', line 17

# Reader for @done, the array of downloaded urls.
def done; @done; end

#execute ⇒ Object

What to do



21
22
23
# File 'lib/websitary/configuration.rb', line 21

# Reader for @execute, the name of the command to run ("what to do").
def execute; @execute; end

#mtimes ⇒ Object

Cached mtimes



25
26
27
# File 'lib/websitary/configuration.rb', line 25

# Reader for @mtimes, the cache of file modification times.
def mtimes; @mtimes; end

#options ⇒ Object

Global Options



23
24
25
# File 'lib/websitary/configuration.rb', line 23

# Reader for @options, the global options.
def options; @options; end

#quicklist_profile ⇒ Object

The name of the quicklist profile



27
28
29
# File 'lib/websitary/configuration.rb', line 27

# Reader for @quicklist_profile, the name of the quicklist profile.
def quicklist_profile; @quicklist_profile; end

#todo ⇒ Object (readonly)

Array of urls to be downloaded.



15
16
17
# File 'lib/websitary/configuration.rb', line 15

# Reader for @todo, the array of urls still to be downloaded.
def todo; @todo; end

#urls ⇒ Object

Hash (key = URL, value = Hash of options)



13
14
15
# File 'lib/websitary/configuration.rb', line 13

# Reader for @urls, a hash (key = URL, value = hash of options).
def urls; @urls; end

Instance Method Details

#call_cmd(cmd, args, default = nil) ⇒ Object

Apply the argument to cmd (a format String or a Proc). If a String, execute the command.



496
497
498
# File 'lib/websitary/configuration.rb', line 496

# Apply args to cmd by way of eval_arg. The block handed to eval_arg
# runs the string it receives as a shell command (backticks) and
# returns what the command printed.
#
# cmd::     A format String or a Proc
# args::    Array of arguments
# default:: Passed through to eval_arg
def call_cmd(cmd, args, default=nil)
    eval_arg(cmd, args, default) do |shell_command|
        `#{shell_command}`
    end
end

#canonic_filename(filename) ⇒ Object



963
964
965
# File 'lib/websitary/configuration.rb', line 963

# Pass filename through the global :canonic_filename option by means
# of call_cmd; filename itself is handed over as the default value.
def canonic_filename(filename)
    canonicalizer = get_optionvalue(:global, :canonic_filename)
    call_cmd(canonicalizer, [filename], filename)
end

#canonic_url(url) ⇒ Object

Strip the url’s last part (after #).



752
753
754
# File 'lib/websitary/configuration.rb', line 752

# Return url without its fragment, i.e. without the first '#' and
# everything that follows it.
def canonic_url(url)
    fragment = /#.*$/
    url.sub(fragment, '')
end

#default(*profile_names) ⇒ Object

Configuration command: Set the default profiles



281
282
283
# File 'lib/websitary/configuration.rb', line 281

# Configuration command:
# Make the given profile names the default profiles (@default_profiles).
def default(*profile_names); @default_profiles = profile_names; end

#diff(diff) ⇒ Object

Configuration command: Set the default diff program.



452
453
454
# File 'lib/websitary/configuration.rb', line 452

# Configuration command:
# Set the default diff program, stored as the :default entry of
# @options[:diff].
def diff(diff)
    diff_options = @options[:diff]
    diff_options[:default] = diff
end

#diffname(url, ensure_dir = false) ⇒ Object

Get the diff filename.



661
662
663
# File 'lib/websitary/configuration.rb', line 661

# Get the diff filename for url: an encoded filename below the 'diff'
# directory, always using the 'md5' cache type.
def diffname(url, ensure_dir=false)
    subdir, cachetype = 'diff', 'md5'
    encoded_filename(subdir, url, ensure_dir, cachetype)
end

#diffprocess(&block) ⇒ Object

Configuration command: Set the default diff processor. The block takes the diff text (STRING) as argument.



424
425
426
# File 'lib/websitary/configuration.rb', line 424

# Configuration command:
# Set the default diff processor. The block takes the diff text
# (STRING) as argument.
#
# The block is stored as the :default entry of @options[:diffprocess],
# mirroring downloadprocess. It used to be written to @options[:diff],
# which replaced the default diff program set by the diff command.
def diffprocess(&block)
    # Create the option table on demand so the command also works when
    # no :diffprocess entry has been set up yet.
    @options[:diffprocess] ||= {}
    @options[:diffprocess][:default] = block
end

#download(download) ⇒ Object

Configuration command: Set the default downloader.



459
460
461
# File 'lib/websitary/configuration.rb', line 459

# Configuration command:
# Set the default downloader, stored as the :default entry of
# @options[:download].
def download(download)
    download_options = @options[:download]
    download_options[:default] = download
end

#downloadprocess(&block) ⇒ Object

Configuration command: Set the default download processor. The block takes the downloaded text (STRING) as argument.



416
417
418
# File 'lib/websitary/configuration.rb', line 416

# Configuration command:
# Set the default download processor. The block takes the downloaded
# text (STRING) as argument; it is stored as the :default entry of
# @options[:downloadprocess].
def downloadprocess(&block)
    processors = @options[:downloadprocess]
    processors[:default] = block
end

#edit(cmd) ⇒ Object

Configuration command: Set the editor.



431
432
433
# File 'lib/websitary/configuration.rb', line 431

# Configuration command:
# Set the editor command (@cmd_edit). edit_profile applies the
# profile's filename to it with the % operator.
def edit(cmd); @cmd_edit = cmd; end

#edit_profile(profile = nil) ⇒ Object



929
930
931
932
933
934
935
936
937
938
939
# File 'lib/websitary/configuration.rb', line 929

# Open a profile in the configured editor (@cmd_edit).
#
# profile:: A profile name or an Array of names (default: @profiles).
#           For an Array, every element is edited in turn.
#
# For a single profile the editor command's output is returned. If the
# profile file cannot be found, an error is logged and nil is returned
# instead of starting the editor on an empty filename.
def edit_profile(profile=nil)
    profile ||= @profiles
    case profile
    when Array
        profile.each {|p| edit_profile p}
    else
        fn = profile_filename(profile)
        unless fn
            $logger.error "Unknown profile: #{profile}"
            return nil
        end
        $logger.debug "edit: #{fn}"
        # NOTE(review): backticks capture the editor's stdout; an
        # interactive terminal editor may need Kernel#system instead.
        `#{@cmd_edit % fn}`
    end
end

#eligible_path?(url, path0, path) ⇒ Boolean

Check whether path is eligible on the basis of url or path0. This checks either for a :match option for url or the extensions of path0 and path.

Returns:

  • (Boolean)


784
785
786
787
788
789
790
791
# File 'lib/websitary/configuration.rb', line 784

# Check whether path is eligible on the basis of url or path0.
#
# If a :match option is found for url, the result is that of matching
# path against it (a match position or nil). Otherwise path is eligible
# when it has the same extension as path0.
def eligible_path?(url, path0, path)
    pattern = get(url, :match)
    return path =~ pattern if pattern
    File.extname(path0) == File.extname(path)
end

#encoded_basename(url, type = 'tree') ⇒ Object



708
709
710
711
712
713
714
715
716
# File 'lib/websitary/configuration.rb', line 708

# Encode url as a basename for the given cache type by dispatching to
# the method named "encoded_basename_<type>". If the object does not
# respond to such a method, a fatal error is logged and the program
# exits with status 5.
def encoded_basename(url, type='tree')
    encoder = "encoded_basename_#{type}"
    return send(encoder, url) if respond_to?(encoder)
    $logger.fatal "Unknown cache type: #{type}"
    exit 5
end

#encoded_basename_flat(url) ⇒ Object



724
725
726
# File 'lib/websitary/configuration.rb', line 724

# Basename for the 'flat' cache type: url as returned by encode,
# called without further arguments.
def encoded_basename_flat(url)
    flat_name = encode(url)
    flat_name
end

#encoded_basename_md5(url) ⇒ Object



729
730
731
# File 'lib/websitary/configuration.rb', line 729

# Basename for the 'md5' cache type: the hexadecimal MD5 digest of url.
def encoded_basename_md5(url)
    md5 = Digest::MD5.new
    md5.update(url)
    md5.hexdigest
end

#encoded_basename_tree(url) ⇒ Object



719
720
721
# File 'lib/websitary/configuration.rb', line 719

# Basename for the 'tree' cache type: url is encoded with '/' as the
# second argument to encode, and the result is passed through
# ensure_filename.
def encoded_basename_tree(url)
    encoded = encode(url, '/')
    ensure_filename(encoded)
end

#encoded_filename(dir, url, ensure_dir = false, type = nil) ⇒ Object



689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
# File 'lib/websitary/configuration.rb', line 689

# Build the cache filename for url below @cfgdir/dir and remember the
# filename -> url mapping in @urlencmap.
#
# dir::        Subdirectory of the configuration directory
# url::        The URL to encode
# ensure_dir:: If true, ask @app to make sure the file's directory exists
# type::       Cache type (default: the url's :cachetype option or 'tree')
#
# The 'md5' basename is used instead when the directory could not be
# ensured, when the full name is longer than the global :filename_size
# option (default 255), or when the name is an existing directory.
def encoded_filename(dir, url, ensure_dir=false, type=nil)
    type ||= get(url, :cachetype, 'tree')
    $logger.debug "encoded_filename: type=#{type} url=#{url}"
    path   = File.join(@cfgdir, dir, encoded_basename(url, type))
    parent = File.dirname(path)
    $logger.debug "encoded_filename: rv0=#{path}"
    max_size  = get_optionvalue(:global, :filename_size, 255)
    dir_ready = ensure_dir ? @app.ensure_dir(parent, false) : true
    unless dir_ready and path.size <= max_size and !File.directory?(path)
        $logger.info "Can't use filename, try 'md5' instead: #{url}"
        path = File.join(@cfgdir, dir, encoded_basename(url, :md5))
    end
    @urlencmap[path] = url
    path
end

#eval_arg(format, args, default = nil, &process_string) ⇒ Object

Apply some arguments to a format.

format

String or Proc

args

Array of Arguments



474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
# File 'lib/websitary/configuration.rb', line 474

# Apply some arguments to a format.
#
# format::  String or Proc; nil yields default
# args::    Array of arguments
# default:: Returned when format is nil
#
# A Proc is called with the arguments. Anything else is combined with
# args through the % operator; the resulting string is passed to the
# optional block, whose result is returned, or returned as is when no
# block was given.
def eval_arg(format, args, default=nil, &process_string)
    return default if format.nil?
    if format.is_a?(Proc)
        $logger.debug "eval proc: #{format}/#{args.size}"
        return format.call(*args)
    end
    formatted = format % args
    process_string ? process_string.call(formatted) : formatted
end

#exclude(*urls) ⇒ Object

Configuration command: Add URL-exclusion patterns (REGEXPs).



438
439
440
# File 'lib/websitary/configuration.rb', line 438

# Configuration command:
# Add URL-exclusion patterns (REGEXPs). @exclude is replaced by a new
# array holding the old patterns followed by the new ones.
def exclude(*urls)
    @exclude = @exclude + urls
end

#format(url, difftext) ⇒ Object

Format a diff according to URL’s source options.



465
466
467
468
# File 'lib/websitary/configuration.rb', line 465

# Format a diff according to URL's source options: the url's :format
# option is applied to difftext through eval_arg, with difftext itself
# passed as the default.
def format(url, difftext)
    eval_arg(get(url, :format), [difftext], difftext)
end

#get(url, opt, default = nil) ⇒ Object

Retrieve an option for an url

url

String

opt

Symbol



196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# File 'lib/websitary/configuration.rb', line 196

# Retrieve an option for an url.
#
# url::     String
# opt::     Symbol
# default:: Returned when nothing else applies
#
# Lookup order:
# 1. Unregistered urls yield default right away.
# 2. The url's own value for opt is used. For :diffprocess and :format
#    the url's :diff value stands in when opt itself is not set, and
#    the url's :use value stands in for any option that is not set.
# 3. A Symbol value is resolved through get_option(opt, value); any
#    other non-nil value is returned as is.
# 4. Otherwise default is returned, or, if default is nil or false,
#    the option's global :default entry when there is one.
def get(url, opt, default=nil)
    url_opts = @urls[url]
    unless url_opts
        $logger.debug "Non-registered URL: #{url}"
        return default
    end
    $logger.debug "get: opts=#{url_opts.inspect}"

    lookup = opt
    if [:diffprocess, :format].include?(opt) and !url_opts.has_key?(opt)
        lookup = :diff
    end

    $logger.debug "get: opt=#{opt} opt_=#{lookup}"
    $logger.debug "get: #{url_opts[lookup]} #{url_opts[:use]}"
    val = if url_opts.has_key?(lookup)
              url_opts[lookup]
          elsif url_opts.has_key?(:use)
              url_opts[:use]
          end

    if val.is_a?(Symbol)
        $logger.debug "get: val=#{val}"
        success, rv = get_option(opt, val)
        $logger.debug "get: #{success}, #{rv}"
        return rv if success
    elsif !val.nil?
        $logger.debug "get: return val=#{val}"
        return val
    end

    unless default
        success, fallback = get_option(opt, :default)
        default = fallback if success
    end

    $logger.debug "get: return default=#{default}"
    default
end

#get_option(opt, val) ⇒ Object



258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
# File 'lib/websitary/configuration.rb', line 258

# Look up val in the option table @options[opt].
#
# Returns [true, value] when the table has an entry for val. An entry
# that is itself a Symbol is looked up again in the same table.
# Returns [false, val] when there is no table for opt or no entry for
# val.
def get_option(opt, val)
    vals = @options[opt]
    $logger.debug "val=#{val} vals=#{vals.inspect}"
    unless vals and vals.has_key?(val)
        $logger.debug "get_option no: #{opt} => #{val.inspect}"
        return [false, val]
    end

    resolved = vals[val]
    $logger.debug "get_option ok: #{opt} => #{resolved.inspect}"
    if resolved.is_a?(Symbol)
        $logger.debug "get_option re: #{resolved}"
        get_option(opt, resolved)
    else
        $logger.debug "get_option true, #{resolved}"
        [true, resolved]
    end
end

#get_optionvalue(opt, val, default = nil) ⇒ Object



243
244
245
246
247
248
249
250
251
252
253
254
255
# File 'lib/websitary/configuration.rb', line 243

# Resolve val against the option table for opt.
#
# A Symbol is looked up with get_option; the entry found is returned,
# or default when there is none. Any other val is returned unchanged.
def get_optionvalue(opt, val, default=nil)
    return val unless val.is_a?(Symbol)
    found, resolved = get_option(opt, val)
    found ? resolved : default
end

#get_output_html(difftext) ⇒ Object



603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
# File 'lib/websitary/configuration.rb', line 603

# Render the collected diffs as an HTML page.
#
# difftext:: Enumerable of [url, text] pairs
#
# Per url, the tags named by its :strip_tags option are removed; pages
# whose text is empty afterwards are dropped. The remaining pages are
# sorted with sort_difftext!, then a table of contents and one section
# per page are built. Both are handed, together with @output_title, to
# the page template (:page option :format, falling back to :simple).
def get_output_html(difftext)
    difftext = difftext.map do |url, text|
        tags = get(url, :strip_tags)
        text = strip_tags(text, :tags => tags) if tags
        text.empty? ? nil : [url, text]
    end
    difftext.compact!
    sort_difftext!(difftext)

    toc = difftext.map do |url, text|
        ti  = get(url, :title, File.basename(url))
        tid = html_toc_id(url)
        bid = html_body_id(url)
        %{<li id="#{tid}" class="toc"><a class="toc" href="\##{bid}">#{ti}</a></li>}
    end.join("\n")

    idx = 0
    cnt = difftext.map do |url, text|
        idx += 1
        ti   = get(url, :title, File.basename(url))
        bid  = html_body_id(url)
        if (rewrite = get(url, :rewrite_link))
            urlr = eval_arg(rewrite, [url])
            ext  = ''
        else
            old  = %{<a class="old" href="#{file_url(oldname(url))}">old</a>}
            lst  = %{<a class="latest" href="#{file_url(latestname(url))}">latest</a>}
            ext  = %{ (#{old}, #{lst})}
            urlr = url
        end
        note = difftext_annotation(url)
        # The section template must interpolate its values; with the
        # '#' characters escaped, the placeholders themselves ended up
        # in the page.
        <<HTML
<div id="#{bid}" class="webpage">
<div class="count">
#{idx}
</div>
<h1 class="diff">
<a class="external" href="#{urlr}">#{ti}</a>#{ext}
</h1>
<div class="annotation">
#{note && CGI::escapeHTML(note)}
</div>
<div class="diff,difftext">
#{format(url, text)}
</div>
</div>
HTML
    end.join(('<hr class="separator"/>') + "\n")

    success, template = get_option(:page, :format)
    unless success
        success, template = get_option(:page, :simple)
    end
    return eval_arg(template, [@output_title, toc, cnt])
end

#get_output_rss(difftext) ⇒ Object



551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
# File 'lib/websitary/configuration.rb', line 551

# Render the collected diffs as an RSS feed and return it as a String.
#
# difftext:: Enumerable of [url, text] pairs
#
# The :rss option :url is required; without it a fatal error is logged
# and the program exits with status 5. The remaining :rss options
# (:description, :copyright, ...) are copied onto the channel when set.
# Each page with non-empty text becomes one item; the per-url options
# :rss_author, :rss_date, :rss_enclosure, :rss_category and
# :rss_pubDate are copied onto the item when set.
def get_output_rss(difftext)
    success, rss_url = get_option(:rss, :url)
    unless success
        $logger.fatal "Global option :rss[:url] not defined."
        exit 5
    end

    _found, rss_version = get_option(:rss, :version)
    # require "rss/#{rss_version}"

    rss         = RSS::Rss.new(rss_version)
    chan        = RSS::Rss::Channel.new
    chan.title  = @output_title
    [:description, :copyright, :category, :language, :image, :webMaster, :pubDate].each do |field|
        ok, val = get_option(:rss, field)
        # These are channel-level fields; no item exists at this point.
        chan.send(format_symbol(field, '%s='), val) if ok
    end
    chan.link   = rss_url
    rss.channel = chan

    difftext.each do |url, text|
        rss_format = get(url, :rss_format, 'plain_text')
        text = strip_tags(text, :format => rss_format)
        next if text.empty?

        item = RSS::Rss::Channel::Item.new
        item.date  = Time.now
        item.title = get(url, :title, File.basename(url))
        item.link  = eval_arg(get(url, :rewrite_link, '%s'), [url])
        [:author, :date, :enclosure, :category, :pubDate].each do |field|
            val = get(url, format_symbol(field, 'rss_%s'))
            item.send(format_symbol(field, '%s='), val) if val
        end

        annotation = difftext_annotation(url)
        annotation = "<pre>#{annotation}</pre>" if annotation
        case rss_format
        when 'plain_text'
            item.description = %{#{annotation}<pre>#{text}</pre>}
        else
            item.description = %{#{annotation}\n#{text}}
        end
        chan.items << item
    end

    return rss.to_s
end

#get_output_text(difftext) ⇒ Object



536
537
538
539
540
541
542
543
544
545
546
547
548
# File 'lib/websitary/configuration.rb', line 536

# Render the collected diffs as plain text.
#
# difftext:: Enumerable of [url, text] pairs
#
# HTML diffs are converted with html_to_text. Every page with a
# non-empty text contributes a section made of the (possibly rewritten)
# url, its annotation, an empty line and the text. Sections are
# separated by a line of 68 dashes. Pages without text are skipped;
# they used to show up as the word "false" in the output.
def get_output_text(difftext)
    separator = "\n\n#{('-' * 68)}\n\n"
    sections = difftext.map do |url, text|
        next unless text
        text = html_to_text(text) if is_html?(text)
        next if text.empty?
        [
            eval_arg(get(url, :rewrite_link, '%s'), [url]),
            difftext_annotation(url),
            nil,
            text
        ].join("\n")
    end
    sections.compact.join(separator)
end

#global(options) ⇒ Object

Set a global option.



378
379
380
381
382
# File 'lib/websitary/configuration.rb', line 378

# Set global options: every key/value pair of the options hash is
# stored in @options[:global].
def global(options)
    options.each {|name, setting| @options[:global][name] = setting}
end

#guess_dir(path) ⇒ Object

Guess path’s dirname.

foo/bar     -> foo
foo/bar.txt -> foo
foo/bar/    -> foo/bar


746
747
748
# File 'lib/websitary/configuration.rb', line 746

# Guess path's dirname.
#
#   foo/bar     -> foo
#   foo/bar.txt -> foo
#   foo/bar/    -> foo/bar
#
# A path ending in '/' is taken to be a directory already and only
# loses that trailing slash.
def guess_dir(path)
    if path[-1, 1] == '/'
        path.chop
    else
        File.dirname(path)
    end
end

#highlighter(rx, color = nil, group = nil, tag = 'span') ⇒ Object

Return a Proc that takes a text as argument and highlights occurrences of rx.

rx

Regular expression

color

A string, sets the class to highlight-color (default: “red”)

group

A number (default: 0)

tag

The HTML tag to use (default: “span”)



919
920
921
# File 'lib/websitary/configuration.rb', line 919

# Return a lambda that takes a text and wraps every occurrence of rx
# in an HTML element.
#
# rx::    Regular expression
# color:: A string, sets the class to highlight-<color> (default: 'red')
# group:: Number of the match group to keep as the content (default: 0)
# tag::   The HTML tag to use (default: 'span')
def highlighter(rx, color=nil, group=nil, tag='span')
    css_class   = "highlight-#{color || 'red'}"
    replacement = %{<#{tag} class="#{css_class}">\\#{group || 0}</#{tag}>}
    lambda do |text|
        text.gsub(rx, replacement)
    end
end

#latestname(url, ensure_dir = false, type = nil) ⇒ Object

Get the filename for the freshly downloaded copy.



673
674
675
# File 'lib/websitary/configuration.rb', line 673

# Get the filename for the freshly downloaded copy of url: an encoded
# filename below the 'latest' directory.
def latestname(url, ensure_dir=false, type=nil)
    subdir = 'latest'
    encoded_filename(subdir, url, ensure_dir, type)
end

#oldname(url, ensure_dir = false, type = nil) ⇒ Object

Get the backup filename.



667
668
669
# File 'lib/websitary/configuration.rb', line 667

# Get the backup filename for url: an encoded filename below the 'old'
# directory.
def oldname(url, ensure_dir=false, type=nil)
    subdir = 'old'
    encoded_filename(subdir, url, ensure_dir, type)
end

#option(type, options) ⇒ Object

Configuration command: Set global options.

type

Symbol

options

Hash



366
367
368
369
370
371
372
373
374
# File 'lib/websitary/configuration.rb', line 366

# Configuration command:
# Set global options.
#
# type::    Symbol, the option table to change
# options:: Hash that is merged into that table
#
# An error is logged when there is no option table for type.
def option(type, options)
    $logger.info "option #{type}: #{options.inspect}"
    table = @options[type]
    return table.merge!(options) if table
    $logger.error "Unknown option type: #{type} (#{options.inspect})"
end

#output_file(filename, outformat = nil) ⇒ Object

Set the output file.



357
358
359
# File 'lib/websitary/configuration.rb', line 357

# Set the output file for an output format. The filename is stored in
# @outfile under the key outformat (nil when no format is given).
def output_file(filename, outformat=nil); @outfile[outformat] = filename; end

#output_format(*format) ⇒ Object

Set the output format.



347
348
349
350
351
352
353
# File 'lib/websitary/configuration.rb', line 347

# Set the output format(s). Known formats are 'text', 'html' and
# 'rss'; for anything else a fatal error is logged and the program
# exits with status 5.
def output_format(*format)
    known = ['text', 'html', 'rss']
    if format.any? {|e| !known.include?(e)}
        $logger.fatal "Unknown output format: #{format}"
        exit 5
    end
    @output_format = format
end

#parse_command_line_args(args) ⇒ Object



82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# File 'lib/websitary/configuration.rb', line 82

# Parse the command-line arguments and load the selected profiles.
#
# args:: Array of command-line arguments; options are removed from it
#        (OptionParser#parse!), what remains are the profile names.
#
# When no profile is named, @default_profiles is used. If @app responds
# to "cmdline_arg_<command>" for the selected command, every profile
# name is passed to that handler instead of being loaded with profile.
#
# Returns self.
def parse_command_line_args(args)
    $logger.debug "parse_command_line_args: #{args}"
    parser = OptionParser.new do |op|
        op.banner =  "Usage: #{Websitary::APPNAME} [OPTIONS] [PROFILES] > [OUT]"
        op.separator ''
        op.separator "#{Websitary::APPNAME} is a free software with ABSOLUTELY NO WARRANTY under"
        op.separator 'the terms of the GNU General Public License version 2 or newer.'
        op.separator ''

        op.separator 'General Options:'

        op.on('-c', '--cfg=DIR', String, 'Configuration directory') do |value|
            @cfgdir = value
        end

        op.on('-e', '--execute=COMMAND', String, 'Define what to do (default: downdiff)') do |value|
            @execute = value
        end

        # op.on('-E', '--edit=PROFILE', String, 'Edit a profile') do |value|
        #   edit_profile value
        #   exit 0
        # end

        op.on('-f', '--output-format=FORMAT', 'Output format (html, text, rss)') do |value|
            output_format(*value.split(/,/))
        end

        op.on('--[no-]ignore-age', 'Ignore age limits') do |bool|
            set :ignore_age => bool
        end

        op.on('--log=DESTINATION', String, 'Log destination') do |value|
            @logger = Websitary::AppLog.new(value != '-' && value)
        end

        op.on('-o', '--output=FILENAME', String, 'Output') do |value|
            output_file(value)
        end

        op.on('-s', '--set=NAME=VAR', String, 'Set a default option') do |value|
            key, val = value.split(/=/, 2)
            # NOTE(review): the value is evaluated as Ruby code. That is
            # how non-string values are passed on the command line, but
            # it must never be fed from an untrusted source.
            set key.intern => eval(val)
        end

        op.on('-t', '--timer=N', Numeric, 'Repeat every N seconds (never exit)') do |value|
            global(:timer => value)
        end

        op.on('-x', '--exclude=N', Regexp, 'Exclude URLs matching this pattern') do |value|
            exclude(value)
        end

        op.separator ''
        op.separator "Available commands (default: #@execute):"
        # Commands are the application's execute_* methods. Method names
        # are Strings or Symbols depending on the Ruby version.
        commands = @app.methods.map do |m|
            mt = m.to_s.match(/^execute_(.*)$/)
            mt && mt[1]
        end
        commands.compact!
        commands.sort!
        op.separator commands.join(', ')

        op.separator ''
        op.separator 'Available profiles:'
        op.separator Dir[File.join(@cfgdir, '*.rb')].map {|f| File.basename(f, '.*')}.join(', ')

        op.separator ''
        op.separator 'Other Options:'

        op.on('--debug', 'Show debug messages') do |v|
            $VERBOSE = $DEBUG = true
            @logger.set_level(:debug)
        end

        op.on('-q', '--quiet', 'Be mostly quiet') do |v|
            @logger.set_level(:quiet)
        end

        op.on('-v', '--verbose', 'Run verbosely') do |v|
            $VERBOSE = true
            @logger.set_level(:verbose)
        end

        op.on('--version', 'Show version number') do |v|
            puts Websitary::VERSION
            exit 1
        end

        op.on_tail('-h', '--help', 'Show this message') do
            puts op
            exit 1
        end
    end

    @profiles = parser.parse!(args)
    @profiles = @default_profiles if @profiles.empty?
    cla_handler = "cmdline_arg_#{@execute}"
    cla_handler = nil unless @app.respond_to?(cla_handler)
    @profiles.each do |pn|
        if cla_handler
            @app.send(cla_handler, self, pn)
        else
            profile pn
        end
    end

    self
end

#profile(profile_name) ⇒ Object

Configuration command: Load a profile



293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
# File 'lib/websitary/configuration.rb', line 293

# Configuration command:
# Load a profile.
#
# profile_name:: '-' registers every line read by readlines as a
#                source and returns false; '__END__' evaluates the
#                contents of DATA; any other name is resolved with
#                profile_filename and the file's contents are evaluated.
#
# Returns the result of eval_profile, or false (after logging an
# error) when the profile file cannot be found.
def profile(profile_name)
    if '-' == profile_name
        readlines.map! {|l| l.chomp}.each {|url| source url}
        return false
    end

    if '__END__' == profile_name
        $logger.debug "Profile: __END__"
        return eval_profile(DATA.read)
    end

    fn = profile_filename(profile_name)
    unless fn
        $logger.error "Unknown profile: #{profile_name}"
        return false
    end
    $logger.debug "Profile: #{fn}"
    eval_profile(File.read(fn), fn)
end

#profile_filename(profile_name, check_file_exists = true) ⇒ Object



942
943
944
945
946
947
948
949
950
951
952
953
954
# File 'lib/websitary/configuration.rb', line 942

# Find the file for a profile.
#
# profile_name::      Profile name; '.rb' is appended unless the name
#                     already has that extension.
# check_file_exists:: If true (the default), return nil when the file
#                     is not found; if false, return the name the file
#                     would have in @cfgdir.
#
# The current directory is searched before @cfgdir.
def profile_filename(profile_name, check_file_exists=true)
    profile_name = "#{profile_name}.rb" if File.extname(profile_name) != '.rb'
    filename = nil
    ['.', @cfgdir].each do |d|
        filename = File.join(d, profile_name)
        # File.exist? is the spelling available in every Ruby version;
        # the File.exists? alias was removed in Ruby 3.2.
        return filename if File.exist?(filename)
    end
    check_file_exists ? nil : filename
end

#push_hrefs(url, hpricot, &condition) ⇒ Object

Scan hpricot document for hrefs and push them onto @todo if not already included.



796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
# File 'lib/websitary/configuration.rb', line 796

# Scan hpricot document for hrefs and push them onto @todo if not
# already included.
#
# url::       The URL the document was loaded from
# hpricot::   The parsed document
# condition:: Block called with (uri0, pn0, uri, pn) -- the page's URI
#             and directory and the link's URI and directory; the link
#             is only followed when the block returns a true value.
#
# Nothing is done when the document carries a 'nofollow' robots
# directive or when the url's :depth option is zero or less. Followed
# links inherit the url's options, with the link's basename appended
# to the title and :depth decreased by one. Errors are logged and end
# the scan.
def push_hrefs(url, hpricot, &condition)
    begin
        return if robots?(hpricot, 'nofollow')
        depth = get(url, :depth)
        return if depth and depth <= 0
        uri0  = URI.parse(url)
        pn0   = Pathname.new(guess_dir(uri0.path))
        (hpricot / 'a').each do |a|
            next if a['rel'] == 'nofollow'
            href = a['href']
            next if href.nil? or href == url or href =~ /^\s*javascript:/
            href = rewrite_href(href, url, uri0, pn0, true)
            # rewrite_href returns nil for links it cannot resolve. Such
            # links have to be skipped before canonic_url is called, or
            # the error raised there ends the scan for all later links.
            next unless href
            curl = canonic_url(href)
            next if @done.include?(curl) or @todo.include?(curl)
            uri  = URI.parse(href)
            pn   = Pathname.new(guess_dir(uri.path))
            next unless condition.call(uri0, pn0, uri, pn)
            next unless robots_allowed?(curl, uri)
            opts = @urls[url].dup
            opts[:title] = [opts[:title], File.basename(curl)].join(' - ')
            opts[:depth] = depth - 1 if depth and depth >= 0
            @urls[curl] = opts
            to_do curl
        end
    # NOTE(review): this also traps Interrupt and SystemExit; narrow it
    # once it is clear which errors the helpers called here can raise.
    rescue Exception => e
        $logger.error e.message
        $logger.debug e.backtrace
    end
end

#quicklist(profile_name) ⇒ Object



286
287
288
# File 'lib/websitary/configuration.rb', line 286

# Set the name of the quicklist profile (@quicklist_profile).
def quicklist(profile_name); @quicklist_profile = profile_name; end

#rewrite_href(href, url, uri = nil, urd = nil, local = false) ⇒ Object

Try to make href an absolute url.



853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
# File 'lib/websitary/configuration.rb', line 853

# Try to make href an absolute url.
#
# href::  The link target as found in the document
# url::   The URL of the document containing the link
# uri::   Parsed form of url (default: URI.parse(url))
# urd::   Directory of the document (default: guess_dir(uri.path))
# local:: If true, a relative link is replaced by the name of the
#         local copy (latestname) when its target is queued, already
#         downloaded, or that copy exists
#
# Returns the rewritten href as a String, or nil for empty and
# javascript: links, when no rewrite applies, or when an error
# occurred (errors are logged).
def rewrite_href(href, url, uri=nil, urd=nil, local=false)
    begin
        return if !href or href =~ /^\s*javascript:/
        # Strip first: URI.parse rejects surrounding whitespace, so
        # parsing the raw value turned such links into errors.
        href  = href.strip
        urh   = URI.parse(href)
        uri ||= URI.parse(url)
        urd ||= guess_dir(uri.path)
        rv    = nil

        if href =~ /\w+:/
            # Looks like it has a scheme: use as is.
            rv = href
        elsif urh.relative?
            if uri.relative?
                if uri.instance_of?(URI::Generic)
                    rv = File.join(urd, href)
                end
            else
                rv = uri.merge(href).to_s
                if local
                    hf = latestname(rv)
                    if @todo.include?(rv) or @done.include?(rv) or File.exist?(hf)
                        rv = hf
                    end
                end
            end
        elsif href[0..0] == '#'
            rv = url + href
        elsif uri.host == urh.host
            rv = uri.merge(href).to_s
        else
            rv = href
        end

        case rv
        when String
            return rv
        when nil
        else
            $logger.error "Internal error: href=#{href}"
            $logger.debug caller.join("\n")
        end
        return
    # NOTE(review): this also traps Interrupt and SystemExit.
    rescue Exception => e
        $logger.error e.message
        $logger.debug e.backtrace
    end
    return nil
end

#rewrite_urls(url, doc) ⇒ Object

Rewrite urls in doc

url

String

doc

Hpricot document



837
838
839
840
841
842
843
844
845
846
847
848
849
# File 'lib/websitary/configuration.rb', line 837

# Rewrite urls in doc.
#
# url:: String, the URL doc was loaded from
# doc:: Hpricot document
#
# The href of every 'a' element and then the src of every 'img'
# element is passed through rewrite_href (links with local = true,
# images with local = false) and replaced when a value comes back.
# Returns doc.
def rewrite_urls(url, doc)
    base     = URI.parse(url)
    base_dir = guess_dir(base.path)
    [['a', 'href', true], ['img', 'src', false]].each do |tag, attribute, local|
        (doc / tag).each do |element|
            target = rewrite_href(element[attribute], url, base, base_dir, local)
            element[attribute] = target if target
        end
    end
    doc
end

#set(options) ⇒ Object

Configuration command: Set the default value for source-options.



387
388
389
390
# File 'lib/websitary/configuration.rb', line 387

# Configuration command:
# Set the default value for source-options by merging the options hash
# into @default_options.
def set(options)
    $logger.debug "set: #{options.inspect}"
    @default_options.update(options)
end

#shortcut(symbol, args) ⇒ Object

Define an options shortcut.



316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
# File 'lib/websitary/configuration.rb', line 316

# Define an options shortcut.
#
# symbol:: The shortcut's name
# args::   Hash mapping option tables (e.g. :download, :diff) to the
#          value the shortcut stands for in that table
#
# With a :delegate entry, every option table not named in args maps
# the shortcut to the delegate. Without one, a warning is logged if
# any of :download, :downloadformat, :diff, :format or :diffprocess is
# an option table that args leaves out.
def shortcut(symbol, args)
    given     = args.keys
    undefined = @options.keys - given
    delegated = given.include?(:delegate)

    # :downloadprocess
    unless delegated
        checked = [:download, :downloadformat, :diff, :format, :diffprocess]
        if undefined.any? {|e| checked.include?(e)}
            $logger.warn "Shortcut #{symbol}: Undefined fields: #{undefined.inspect}"
        end
    end

    if delegated
        undefined.each {|field| @options[field][symbol] = args[:delegate]}
    end

    args.each do |field, val|
        next if field == :delegate
        @options[field][symbol] = val
    end
end

#show_output(difftext) ⇒ Object

Generate & view the final output.

difftext

Hash



503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
# File 'lib/websitary/configuration.rb', line 503

# Generate & view the final output.
#
# difftext:: Hash
#
# Returns 0 (after logging a warning) when difftext is empty and 1
# otherwise. For every format in @output_format the matching
# get_output_<format> method produces the output; it is printed when
# the output file is '-', else written to the file and shown with
# view_output_<format>. An unknown format is fatal (exit status 5).
def show_output(difftext)
    if difftext.empty?
        parts = ['No news is good news']
        parts << "try again in #{@app.format_tdiff(@app.tdiff_min)}" if @app.tdiff_min
        $logger.warn parts.join('; ')
        return 0
    end

    @output_format.each do |outformat|
        generator = "get_output_#{outformat}"

        unless respond_to?(generator)
            $logger.fatal "Unknown output format: #{outformat}"
            exit 5
        end

        out = send(generator, difftext)
        next unless out

        outfile = get_outfile(outformat)
        if '-' == outfile
            puts out
        else
            write_file(outfile) {|io| io.puts out}
            send("view_output_#{outformat}", outfile)
        end
    end
    1
end

#source(urls, opts = {}) ⇒ Object

Configuration command: Define a source.

urls

String



405
406
407
408
409
410
# File 'lib/websitary/configuration.rb', line 405

# Configuration command:
# Define a source.
#
# urls:: String; one URL per line
# opts:: Hash of source options laid over a copy of @default_options
#
# Every line is registered in @urls with its options and handed to
# to_do.
def source(urls, opts={})
    urls.split("\n").flatten.compact.each do |line|
        settings = @default_options.dup
        settings.update(opts)
        @urls[line] = settings
        to_do line
    end
end

#strip_tags(doc, args = {}) ⇒ Object



763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
# File 'lib/websitary/configuration.rb', line 763

# Remove elements from a document.
#
# doc::  A String (parsed with Hpricot) or an already parsed document
# args:: Hash;
#        :tags   => selectors of the elements to remove (default: the
#                   result of strip_tags_default; nothing is removed
#                   when that is nil as well)
#        :format => :hpricot returns the document itself; any other
#                   value calls the document's "to_<format>" method
#                   (default: to_html)
def strip_tags(doc, args={})
    # Neither :tags nor a configured default means there is nothing to
    # strip; this used to fail with a NoMethodError on nil.
    tags = args[:tags] || strip_tags_default || []
    doc  = Hpricot(doc) if doc.is_a?(String)
    tags.each do |tag|
        doc.search(tag).remove
    end
    return doc if args[:format] == :hpricot
    doc.send("to_#{args[:format] || :html}")
end

#strip_tags_default ⇒ Object



757
758
759
760
# File 'lib/websitary/configuration.rb', line 757

# Return a copy of the :default entry of the :strip_tags option, or
# nil when no such entry is found.
def strip_tags_default
    found, tags = get_option(:strip_tags, :default)
    found ? tags.dup : nil
end

#to_do(url) ⇒ Object



339
340
341
342
343
# File 'lib/websitary/configuration.rb', line 339

# Append url to @todo unless it matches one of the exclusion patterns
# in @exclude.
def to_do(url)
    return if @exclude.any? {|pattern| url =~ pattern}
    @todo << url
end

#unset(*options) ⇒ Object

Configuration command: Unset a default source-option.



395
396
397
398
399
# File 'lib/websitary/configuration.rb', line 395

# Configuration command:
# Unset default source-options: every named option is deleted from
# @default_options.
def unset(*options)
    options.each {|name| @default_options.delete(name)}
end

#url_from_filename(filename) ⇒ Object



678
679
680
681
682
683
684
685
686
# File 'lib/websitary/configuration.rb', line 678

# Return the url recorded for filename in @urlencmap (the map filled
# by encoded_filename). A warning is logged, and a false value
# returned, when the filename has no entry.
def url_from_filename(filename)
    url = @urlencmap[filename]
    unless url
        $logger.warn "Unmapped filename: #{filename}"
        return url
    end
    $logger.debug "Map filename: #{filename} -> #{url}"
    url
end

#urlextname(url) ⇒ Object



734
735
736
737
738
739
# File 'lib/websitary/configuration.rb', line 734

# Return the extension (including the dot) of the path component of
# url, or nil when url cannot be parsed or has no usable path.
#
# Only StandardError is trapped, so interrupts and exit requests are
# no longer swallowed; the failure is logged at debug level.
def urlextname(url)
    File.extname(URI.parse(url).path)
rescue StandardError => e
    $logger.debug "urlextname: #{url.inspect}: #{e.message}"
    nil
end

#view(view) ⇒ Object

Configuration command: Set the viewer.



445
446
447
# File 'lib/websitary/configuration.rb', line 445

# Configuration command:
# Set the viewer (@view).
def view(view); @view = view; end

#view_output(outfile = nil) ⇒ Object



924
925
926
# File 'lib/websitary/configuration.rb', line 924

# Show the output with the view_output_<format> method that belongs to
# the first entry of @output_format.
#
# outfile:: The file to show (default: the result of get_outfile)
def view_output(outfile=nil)
    viewer = "view_output_#{@output_format[0]}"
    send(viewer, outfile || get_outfile)
end

#write_file(filename, mode = 'w', &block) ⇒ Object



957
958
959
960
# File 'lib/websitary/configuration.rb', line 957

# Open filename in the given mode (default 'w'), pass the IO object to
# the block, and then register the file with @mtimes.set.
def write_file(filename, mode='w', &block)
    File.open(filename, mode) do |io|
        block.call(io)
    end
    @mtimes.set(filename)
end