Class: Dblp::Grabber

Inherits:
Object
  • Object
show all
Defined in:
lib/dblp/grabber.rb

Direct Known Subclasses

CiteseerGrabber

Constant Summary collapse

DBLP_URL =

Base URL from which the BibTeX records are fetched; the DBLP key (without its "DBLP:" prefix) is appended to it.

"http://dblp.uni-trier.de/rec/bibtex/"

Instance Method Summary collapse

Constructor Details

#initialize(options = nil) ⇒ Grabber

Returns a new instance of Grabber.



13
14
15
# File 'lib/dblp/grabber.rb', line 13

# Builds a grabber and remembers the caller-supplied options.
#
# The options object is stored as-is; nothing is read from it here.
#
# @param options [Object, nil] optional settings object, kept in @options
def initialize(options = nil)
  @options = options
end

Instance Method Details

#extract_pre(content) ⇒ Object

Extracts the BibTeX entries contained in the <pre> elements of a DBLP page. There is one special case: if the page contains more than one <pre> element, the second one holds a cross-referenced entry. Depending on the options, the cross reference is either returned as an additional entry or merged into the main entry (the short version).



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# File 'lib/dblp/grabber.rb', line 30

# Extracts the BibTeX entries embedded in the <pre> elements of a page.
#
# The first <pre> holds the main entry. Markup inside it is stripped and its
# title is wrapped in an extra pair of braces. When a second <pre> exists it
# is treated as the cross-referenced entry and handled in one of two ways:
#
# * @options.crossref is truthy: the cross reference is cleaned up the same
#   way and returned as a second entry.
# * otherwise: the cross reference is folded into the main entry. Its title
#   replaces the main entry's booktitle (skipped when @options.short is
#   truthy) and its publisher, if any, replaces the crossref field.
#
# A nil @options behaves like an options object with every flag unset.
#
# @param content [String] HTML source of the page
# @return [Array<String>] zero, one or two BibTeX entries
def extract_pre(content)
  # String#scan always returns an Array, so emptiness is the only "no match" case.
  pres = content.scan(%r{<pre>(.*?)</pre>}mi)
  return [] if pres.empty?

  # First handle the main entry.
  result = [pres[0][0].gsub(/(<.*?>)/, "").gsub(/^\s+title\s+=\s+\{(.*?)\},/m, "  title     = {{\\1}},")]
  return result if pres.size < 2

  # The second <pre> carries the cross reference.
  crossref = pres[1][0]

  if @options && @options.crossref
    result << crossref.gsub(/(<.*?>)/, "").gsub(/^\s+title\s+=\s+\{(.*?)\},$/m, "  title     = {{\\1}},")
    return result
  end

  booktitle = crossref.match(/^\s+title\s+=\s+\{(.*?)\},$/m)
  return result unless booktitle

  # Replace the book title with the one from the cross reference, unless the
  # short form was requested. @options may be nil, so guard before asking.
  unless @options && @options.short
    # Collapse every whitespace run (including wrapped lines) to one space.
    cleantitle = booktitle[1].gsub(/\s+/, " ")
    result[0].gsub!(/^\s+booktitle\s+=\s+\{(.*?)\},$/m) { "  booktitle = {{#{cleantitle}}}," }
  end

  publisher = crossref.match(/^\s+publisher\s+=\s+\{(.*?)\},/m)
  publisher_data = publisher ? "  publisher = {{#{publisher[1]}}}," : ""

  # TODO make cross ref handling configurable
  # Block form inserts the text literally; as a replacement string, BibTeX
  # escapes such as "\&" would be expanded as regexp backreferences.
  result[0].gsub!(/^\s+crossref\s+=\s+\{(.*?)\},/m) { publisher_data }
  result
end

#grab(key) ⇒ Object



76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/dblp/grabber.rb', line 76

# Fetches the BibTeX entries for a citation key.
#
# Only keys containing "DBLP:" are resolved: the prefix is removed, the rest
# is appended to DBLP_URL, the page is downloaded with read_html and parsed
# with extract_pre. Any other key yields an empty list (the CiteSeer lookup
# is currently disabled).
#
# Ordinary failures (StandardError and subclasses) are reported on standard
# output and turned into an empty result. Interrupts, exit requests and
# similar non-StandardError exceptions are deliberately not swallowed.
#
# @param key [String] citation key, e.g. "DBLP:conf/x/Y"
# @return [Array<String>] extracted BibTeX entries, or [] when nothing was
#   found or an error occurred
def grab(key)
  # CiteseerGrabber.new.grab(key) used to handle the non-DBLP case.
  return [] unless key =~ /DBLP:/

  extract_pre(read_html(DBLP_URL + key.gsub("DBLP:", "")))
rescue StandardError => e
  puts e.message
  puts e.backtrace.inspect
  []
end

#read_html(url) ⇒ Object



17
18
19
20
21
22
23
# File 'lib/dblp/grabber.rb', line 17

# Reads the complete body of the resource at +url+ and returns it.
#
# The resource is opened with Kernel#open and is closed again when the block
# finishes.
#
# NOTE(review): fetching an http URL this way presumably relies on open-uri
# being loaded elsewhere in the file; Ruby 3.0+ requires URI.open for URLs -
# confirm the supported Ruby versions.
#
# @param url [String] location to read from
# @return [String] everything read from the opened resource
def read_html(url)
  open(url) { |io| io.read }
end