Class: LiveBlogIndexer

Inherits:
Object
  • Object
show all
Defined in:
lib/liveblog-indexer.rb

Instance Method Summary collapse

Constructor Details

#initialize(filepath: '.', word_index: 'wordindex.json', url_index: 'url_index.json') ⇒ LiveBlogIndexer

Returns a new instance of LiveBlogIndexer.



12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
# File 'lib/liveblog-indexer.rb', line 12

# Builds an indexer, loading any previously saved indexes from disk.
#
# filepath   - directory that holds the index files (default: '.').
# word_index - file name of the word index JSON inside +filepath+; pass
#              nil/false to start with an empty word index.
# url_index  - file name of the URL index JSON inside +filepath+; pass
#              nil/false to start with an empty URL index.
#
# A missing file is not an error: the corresponding index starts as {}.
def initialize(filepath: '.', word_index: 'wordindex.json',
               url_index: 'url_index.json')

  @filepath = filepath
  @wordindex_filepath = word_index
  @urls_index_filepath = url_index

  # Parses the named JSON file when a name was given and the file exists,
  # otherwise yields an empty Hash. File.exist? is used because the old
  # File.exists? alias was removed in Ruby 3.2.
  load_index = lambda do |name|
    path = File.join(filepath, name) if name
    (path && File.exist?(path)) ? JSON.parse(File.read(path)) : {}
  end

  @master = load_index.call(word_index)

  @xws = XWS.new

  @url_index = load_index.call(url_index)

end

Instance Method Details

#add_index(src) ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/liveblog-indexer.rb', line 34

# Adds the words of every section in a document to the in-memory word index.
#
# src - either a String, which is read with RXFHelper.read and parsed by
#       Rexle, or an already-parsed document object that is used as-is.
#
# For each 'records/section' element the words reported by @xws.scan for the
# section's 'details' element are stored as @master[word][url] = count.
#
# Returns true after processing, or nil when the document has no
# 'summary/link' text.
def add_index(src)

  doc = src.is_a?(String) ? Rexle.new(RXFHelper.read(src).first) : src

  link = doc.root.element('summary/link/text()')
  return unless link

  doc.root.xpath('records/section').each do |section|

    # path part of the link (scheme, host and trailing slash removed)
    # followed by the section id as the fragment
    url = "%s/#%s" % [link[/^https?:\/\/[^\/]+(.*)(?=\/$)/,1],
                      section.attributes[:id]]

    @xws.scan(section.element('details')).each do |word, count|
      # create the per-word bucket on first sight, then record the count
      (@master[word] ||= {})[url] = count
    end
  end

  true
end


106
107
108
109
110
111
112
113
114
115
# File 'lib/liveblog-indexer.rb', line 106

# Orders liveblog links from newest to oldest.
#
# a - Array of link Strings that start with /liveblog/<year>/<month>/<day>,
#     where the month is a three-character abbreviation.
#
# Returns a new Array; the date embedded in each link is the sort key.
def add_links(a)

  oldest_first = a.sort_by do |link|
    # e.g. "/liveblog/2015/Jan/5/..." -> "2015Jan5"
    stamp = link[/^\/liveblog\/(\d{4}\/\w{3}\/\d+)/,1].delete('/')
    Date.strptime(stamp, "%Y%b%d")
  end

  oldest_first.reverse

end

#crawl(location) ⇒ Object

/add_index



75
76
77
78
79
80
81
# File 'lib/liveblog-indexer.rb', line 75

# Indexes +location+ via index_file, then writes both indexes to disk:
# the word index through #save and the URL index as JSON under @filepath.
def crawl(location)

  index_file(location)
  save(@wordindex_filepath)

  url_index_path = File.join(@filepath, @urls_index_filepath)
  File.write(url_index_path, @url_index.to_json)

end

#inspect ⇒ Object



83
84
85
# File 'lib/liveblog-indexer.rb', line 83

# Returns a short identifying String made of the class name and this
# object's object_id.
def inspect
  "#<LiveBlogIndexer:#{object_id}>"
end

#keys ⇒ Object



87
88
89
# File 'lib/liveblog-indexer.rb', line 87

# Returns the keys of the in-memory word index (@master).
def keys
  @master.keys
end

#save(filename = 'wordindex.json') ⇒ Object



91
92
93
94
95
96
# File 'lib/liveblog-indexer.rb', line 91

# Writes the word index (@master) as JSON to +filename+ inside @filepath
# and prints a confirmation line.
#
# filename - name of the file to write (default: 'wordindex.json').
def save(filename='wordindex.json')

  target = File.join(@filepath, filename)
  File.write(target, @master.to_json)

  puts 'saved ' + filename

end

#search(keyword) ⇒ Object

Searches the word_index file for a keyword, e.g. hdmi. Results are sorted by exact match with a hashtag first, then exact keyword match, followed by words which contain the keyword, e.g. micro-hdmi.

links for each result are sorted by date



104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# File 'lib/liveblog-indexer.rb', line 104

# Searches the word index for +keyword+ (case-insensitive substring match).
#
# keyword - String to look for, e.g. 'hdmi'. It is matched literally, so
#           characters such as '+', '(' or '.' carry no regex meaning.
#
# Matching words are ranked: the hashtag form ('#hdmi') first, then the exact
# word ('hdmi'), then every other word containing the keyword (e.g.
# 'micro-hdmi') in index order. The links of each word are ordered newest
# first by the date embedded in their /liveblog/<year>/<month>/<day> prefix.
#
# Returns an Array of unique link Strings; empty when nothing matches.
def search(keyword)

  # Local ordering helper; kept as a lambda so calling #search does not
  # define or re-define any method as a side effect.
  newest_first = lambda do |links|
    links.sort_by do |uri|
      stamp = uri[/^\/liveblog\/(\d{4}\/\w{3}\/\d+)/,1].delete('/')
      Date.strptime(stamp, "%Y%b%d")
    end.reverse
  end

  # Escape the keyword: unescaped, 'c++' is read as a possessive quantifier
  # and '(' raises RegexpError.
  matches = @master.keys.grep(/#{Regexp.escape(keyword)}/i)
  # e.g. => ["hdmi=safe", "hdmi", "micro-hdmi", "#hdmi"]

  # Pull the two exact forms out of the match list, in priority order;
  # Array#delete returns nil when the word was not among the matches.
  exact = ['#' + keyword, keyword].map { |word| matches.delete(word) }.compact

  (exact + matches).flat_map { |word| newest_first.call(@master[word].keys) }.uniq

end