Class: Dblp::Grabber

Inherits:

Object

Object
Dblp::Grabber

show all

Defined in: lib/dblp/grabber.rb

Direct Known Subclasses

CiteseerGrabber

Constant Summary collapse

DBLP_URL = Constant base URL that BibTeX records are fetched from

"http://dblp.uni-trier.de/rec/bibtex/"

Instance Method Summary collapse

#extract_pre(content) ⇒ Object

Extracts all relevant information from the <pre> elements from the dblp page.
#grab(key) ⇒ Object
#initialize(options = nil) ⇒ Grabber constructor

A new instance of Grabber.
#read_html(url) ⇒ Object

Constructor Details

#initialize(options = nil) ⇒ `Grabber`

Returns a new instance of Grabber.



13
14
15

# File 'lib/dblp/grabber.rb', line 13

# Creates a new Grabber.
#
# The given options object is stored unchanged in @options. It defaults to
# nil; when present, extract_pre calls #crossref and #short on it.
def initialize(options = nil)
  @options = options
end

Instance Method Details

#extract_pre(content) ⇒ `Object`

Extracts all relevant information from the <pre> elements of the dblp page. There is one special case to handle: if there are multiple <pre> elements, the entry uses a cross reference. We then have to decide whether to include the cross-referenced entry or to extract a short version from it.

# File 'lib/dblp/grabber.rb', line 30

# Extracts the BibTeX entries contained in the <pre> elements of a dblp page.
#
# The first <pre> holds the main entry; a second <pre>, if present, holds the
# cross-referenced entry. Depending on @options the cross reference is either
# returned as a second entry (options.crossref), or folded into the main
# entry: its title replaces the booktitle (unless options.short) and its
# publisher replaces the crossref field.
#
# @param content [String] HTML of the dblp page (nil is treated as empty)
# @return [Array<String>] the extracted BibTeX entries, [] if none was found
def extract_pre(content)
  # extract the bibtex code, that is in pre tags
  pres = content.to_s.scan(/<pre>(.*?)<.pre>/mix)
  return [] if pres.empty?

  # First handle main entry: drop embedded markup and double-brace the title
  # so BibTeX keeps its capitalisation
  main_entry = pres[0][0].gsub(/(<.*?>)/, "").gsub(/^\s+title\s+=\s+\{(.*?)\},/m, "  title     = {{\\1}},")
  result = [main_entry]

  # Without a second <pre> there is no cross reference to deal with
  return result if pres.size < 2

  crossref_entry = pres[1][0]

  # @options may be nil (the constructor default), so guard every access
  if @options && @options.crossref
    result << crossref_entry.gsub(/(<.*?>)/, "").gsub(/^\s+title\s+=\s+\{(.*?)\},$/m, "  title     = {{\\1}},")
    return result
  end

  booktitle = crossref_entry.match(/^\s+title\s+=\s+\{(.*?)\},$/m)
  return result unless booktitle

  # If we find a booktitle, replace the book title with the
  # one from the crossref
  unless @options && @options.short
    cleantitle = booktitle[1].gsub(/\n|\t|\s+/, " ")
    result[0].gsub!(/^\s+booktitle\s+=\s+\{(.*?)\},$/m) { "  booktitle = {{#{cleantitle}}}," }
  end

  publisher = crossref_entry.match(/^\s+publisher\s+=\s+\{(.*?)\},/m)
  publisher_data = publisher ? "  publisher = {{#{publisher[1]}}}," : ""

  # TODO make cross ref handling configurable
  # Block form on purpose: a replacement *string* would interpret backslash
  # sequences such as \' (common in BibTeX accents) as regexp back-references.
  result[0].gsub!(/^\s+crossref\s+=\s+\{(.*?)\},/m) { publisher_data }

  result
end

#grab(key) ⇒ `Object`

# File 'lib/dblp/grabber.rb', line 76

# Fetches the BibTeX entries for a citation key.
#
# Only keys containing the "DBLP:" prefix are resolved: the prefix is removed,
# the remainder is appended to DBLP_URL, the page is downloaded with
# read_html and parsed with extract_pre. Any other key yields [].
#
# Errors raised while fetching or parsing are reported on standard output and
# turned into an empty result. Only StandardError is rescued, so Interrupt
# (Ctrl-C), SystemExit and similar still propagate to the caller.
#
# @param key [String] citation key, e.g. "DBLP:conf/icde/Foo09"
# @return [Array<String>] extracted BibTeX entries, [] if nothing was found
def grab(key)
  # Check the key
  return [] unless key =~ /DBLP:/

  content = read_html(DBLP_URL + key.gsub("DBLP:", ""))
  extract_pre(content)
rescue StandardError => e
  puts e.message
  puts e.backtrace.inspect
  []
end

#read_html(url) ⇒ `Object`

# File 'lib/dblp/grabber.rb', line 17

# Downloads the document at the given URL and returns its body.
#
# Relies on open-uri being loaded by the surrounding file. Kernel#open no
# longer opens URLs through open-uri on Ruby 3.0+, so URI.open is used when it
# is available; older Rubies fall back to Kernel#open as before.
#
# @param url [String] the URL to fetch
# @return [String] the response body
def read_html(url)
  if defined?(URI) && URI.respond_to?(:open)
    URI.open(url) { |f| f.read }
  else
    open(url) { |f| f.read }
  end
end