Class: Dblp::Grabber
- Inherits:
-
Object
- Object
- Dblp::Grabber
- Defined in:
- lib/dblp/grabber.rb
Direct Known Subclasses
Constant Summary collapse
- DBLP_URL =
Base URL from which the BibTeX records are fetched
"http://dblp.uni-trier.de/rec/bibtex/"
Instance Method Summary collapse
-
#extract_pre(content) ⇒ Object
Extracts all relevant information from the <pre> elements from the dblp page.
- #grab(key) ⇒ Object
-
#initialize(options = nil) ⇒ Grabber
constructor
A new instance of Grabber.
- #read_html(url) ⇒ Object
Constructor Details
#initialize(options = nil) ⇒ Grabber
Returns a new instance of Grabber.
13 14 15 |
# File 'lib/dblp/grabber.rb', line 13
#
# Creates a grabber and remembers the given options object.
#
# The options are read later by #extract_pre, which calls +crossref+ and
# +short+ on them; +nil+ (the default) means no options were supplied.
#
# @param options [Object, nil] settings object; presumably responds to
#   +crossref+ and +short+ -- confirm against the caller
def initialize(options = nil)
  @options = options
end
Instance Method Details
#extract_pre(content) ⇒ Object
Extracts all relevant information from the <pre> elements of the DBLP page. There is one special case to handle: if the page contains multiple <pre> elements, the entry uses a cross reference. In that case we have to decide whether to include the cross-referenced entry or to extract the short version.
30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# File 'lib/dblp/grabber.rb', line 30
#
# Extracts the BibTeX text found inside the <pre> elements of a DBLP page.
#
# The first <pre> is the main entry. When a second <pre> is present it is
# treated as the cross-referenced entry and handled according to @options:
#
# * +crossref+ set: the cross-referenced entry is returned as a second
#   element.
# * otherwise, if the cross-referenced entry has a title: the main entry's
#   +booktitle+ is replaced by that title (skipped when +short+ is set) and
#   its +crossref+ field is replaced by the cross-referenced entry's
#   publisher (or removed when there is none).
#
# Missing options (+nil+) behave like options with neither flag set.
#
# @param content [String] markup of the fetched page
# @return [Array<String>] zero, one or two BibTeX entries
def extract_pre(content)
  # Each hit is a one-element array holding the text between the <pre> tags.
  pres = content.scan(/<pre>(.*?)<.pre>/mix)
  return [] if pres.empty?

  # Main entry: drop nested markup and wrap the title in a second pair of braces.
  result = [pres[0][0].gsub(/(<.*?>)/, "").gsub(/^\s+title\s+=\s+\{(.*?)\},/m, " title = {{\\1}},")]
  return result if pres.size < 2

  crossref_entry = pres[1][0]

  if @options && @options.crossref
    result << crossref_entry.gsub(/(<.*?>)/, "").gsub(/^\s+title\s+=\s+\{(.*?)\},$/m, " title = {{\\1}},")
    return result
  end

  booktitle = crossref_entry.match(/^\s+title\s+=\s+\{(.*?)\},$/m)
  return result unless booktitle

  # Guard against @options being nil, mirroring the crossref check above.
  unless @options && @options.short
    cleantitle = booktitle[1].gsub(/\n|\t|\s+/, " ")
    result[0].gsub!(/^\s+booktitle\s+=\s+\{(.*?)\},$/m) { " booktitle = {{#{cleantitle}}}," }
  end

  publisher = crossref_entry.match(/^\s+publisher\s+=\s+\{(.*?)\},/m)
  publisher_data = publisher ? " publisher = {{#{publisher[1]}}}," : ""
  # TODO make cross ref handling configurable
  # Block form keeps the publisher text literal; as a replacement string,
  # sequences such as "\&" would be expanded as back-references.
  result[0].gsub!(/^\s+crossref\s+=\s+\{(.*?)\},/m) { publisher_data }

  result
end
#grab(key) ⇒ Object
76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 |
# File 'lib/dblp/grabber.rb', line 76
#
# Looks up the BibTeX entries for a citation key.
#
# Keys matching /DBLP:/ are fetched from DBLP_URL (with the "DBLP:" marker
# removed) via #read_html and parsed by #extract_pre. Every other key yields
# an empty array. Errors raised while fetching or parsing are printed to
# standard output together with their backtrace, and an empty array is
# returned instead.
#
# @param key [String] citation key, e.g. one containing "DBLP:"
# @return [Array] whatever #extract_pre returns, or [] when the key is not a
#   DBLP key or an error occurred
def grab(key)
  # Only DBLP keys are handled; the Citeseer lookup
  # (CiteseerGrabber.new.grab(key)) is disabled.
  return [] unless key =~ /DBLP:/

  content = read_html(DBLP_URL + key.gsub("DBLP:", ""))
  extract_pre(content)
rescue StandardError => e
  # StandardError rather than Exception, so that Interrupt, SystemExit and
  # similar signals are not swallowed by this best-effort lookup.
  puts e.message
  puts e.backtrace.inspect
  []
end
#read_html(url) ⇒ Object
17 18 19 20 21 22 23 |
# File 'lib/dblp/grabber.rb', line 17
#
# Opens +url+ and returns everything read from it.
#
# Uses open-uri's URI.open because Kernel#open no longer opens URLs on
# Ruby 3.0 and later. The block form closes the stream once it has been read.
# NOTE(review): assumes open-uri is already required by this file -- confirm.
#
# @param url [String] address to read from
# @return [String] the content read from +url+
def read_html(url)
  URI.open(url) { |f| f.read }
end