Class: Blacklight

Inherits:
Service show all
Includes:
MarcHelper, MetadataHelper, UmlautHttp, XmlSchemaHelper
Defined in:
app/service_adaptors/blacklight.rb

Overview

Searches a Blacklight with the cql extension installed.

Params include:

base_url

required. Complete URL to catalog.atom action. Eg “blacklight.mse.jhu.edu/catalog.atom”

bl_fields

required with at least some entries if you want this to do anything. Describe the names of given semantic fields in your BL instance.

  • issn

  • isbn

  • lccn

  • oclcnum

  • id (defaults to ‘id’)

  • title

  • author

  • serials_limit_clause => not an index name, full URL clause for a limit to apply to known serials searches, for instance “f[]=Serial”

identifier_search

Do catalog search on issn/isbn/oclcnum/lccn/bibId. Default true.

keyword_search

Do catalog search on title/author keywords where applicable. Generally only used when identifier_search finds no hits, if identifier_search is on. Default true.

keyword_per_page

How many records to fetch from blacklight when doing keyword searches.

exclude_holdings

Can be used to exclude certain ‘dummy’ holdings that have certain collection, location, or other values. Eg: exclude_holdings:

collection_str:
  - World Wide Web
  - Internet
rft_id_bibnum_prefixes

Array of URI prefixes in an rft_id that indicate that the actual solr id comes next. For instance, if your blacklight will send “blacklight.com/catalog/some_id” in an rft_id, then include “blacklight.com/catalog/”. Optional.

Constant Summary

Constants inherited from Service

Service::LinkOutFilterTask, Service::StandardTask

Instance Attribute Summary collapse

Attributes inherited from Service

#group, #name, #priority, #request, #service_id, #status, #task, #url

Instance Method Summary collapse

Methods included from XmlSchemaHelper

xml_ns, #xml_ns, #xml_to_holdings

Methods included from MarcHelper

#add_856_links, #edition_statement, #get_title, #get_years, #gmd_values, #service_type_for_856, #should_skip_856_link?, #strip_gmd

Methods included from MetadataHelper

#get_doi, #get_epage, #get_gpo_item_nums, #get_identifier, #get_isbn, #get_issn, #get_lccn, #get_month, #get_oclcnum, #get_pmid, #get_search_creator, #get_search_terms, #get_search_title, #get_spage, #get_sudoc, #get_top_level_creator, #get_year, #normalize_lccn, #normalize_title, #raw_search_title, title_is_serial?

Methods included from UmlautHttp

#http_fetch, #proxy_like_headers

Methods inherited from Service

#credits, #display_name, #handle_wrapper, #link_out_filter, #preempted_by, required_config_params, #response_url, #translate

Constructor Details

#initialize(config) ⇒ Blacklight

Returns a new instance of Blacklight.



46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'app/service_adaptors/blacklight.rb', line 46

# Sets adaptor defaults, lets the superclass constructor process +config+,
# then makes sure @bl_fields always has an "id" entry.
#
# Defaults are assigned before calling super; the code after super reads
# @bl_fields, which is not assigned in this method before that call.
#
# rft_id_bibnum_prefixes: if you are sending an OpenURL from a library
# service, you may have the bibnum and include it in the OpenURL as, eg.
# rft_id=http://catalog.library.jhu.edu/bib/343434 (except URL-encoded).
# Then you'd include http://catalog.library.jhu.edu/bib/ in
# rft_id_bibnum_prefixes.
#
# NOTE(review): @keyword_per_page is given a default here but is not read by
# any of the URL-building methods in this adaptor -- confirm whether it
# should be sent to Blacklight.
def initialize(config)
  @rft_id_bibnum_prefixes = []
  @cql_search_field = "cql"
  @keyword_per_page = 10
  @identifier_search = true
  @keyword_search = true
  @link_to_search = true
  # add_holdings iterates this, so it must never be nil.
  @exclude_holdings = {}
  super(config)
  # Default id field is "id" (no trailing whitespace); tolerate a missing
  # bl_fields configuration instead of raising TypeError from merge(nil).
  @bl_fields = { "id" => "id" }.merge(@bl_fields || {})
end

Instance Attribute Details

#base_urlObject (readonly)

Returns the value of attribute base_url.



38
39
40
# File 'app/service_adaptors/blacklight.rb', line 38

# Read-only accessor: hands back whatever is stored in @base_url
# (nil when it has not been set).
def base_url
  return @base_url
end

#bl_fieldsObject (readonly)

Returns the value of attribute bl_fields.



39
40
41
# File 'app/service_adaptors/blacklight.rb', line 39

# Read-only accessor: hands back whatever is stored in @bl_fields
# (nil when it has not been set).
def bl_fields
  return @bl_fields
end

#cql_search_fieldObject (readonly)

Returns the value of attribute cql_search_field.



38
39
40
# File 'app/service_adaptors/blacklight.rb', line 38

# Read-only accessor: hands back whatever is stored in @cql_search_field
# (nil when it has not been set).
def cql_search_field
  return @cql_search_field
end

#issnObject (readonly)

Returns the value of attribute issn.



39
40
41
# File 'app/service_adaptors/blacklight.rb', line 39

# Read-only accessor: hands back whatever is stored in @issn
# (nil when it has not been set).
# NOTE(review): no method shown for this adaptor assigns @issn -- confirm
# whether this reader is still needed.
def issn
  return @issn
end

Instance Method Details

#add_holdings(holdings_url, options = {}) ⇒ Object

Takes a url that will return atom response of dlf_expanded content. Adds Umlaut “holding” ServiceResponses for dlf_expanded, as appropriate. Returns number of holdings added.



249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
# File 'app/service_adaptors/blacklight.rb', line 249

# Fetches +holdings_url+ (expected to return an Atom feed whose entries carry
# dlf_expanded content) and adds one "holding" service response to the
# current request for every copy found.
#
# holdings_url - URL to fetch.
# options      - Hash, all keys optional:
#                :match_reliability - stored on each response; defaults to
#                                     ServiceResponse::MatchExact.
#                :marc_data         - Hash of atom id => MARC record, used to
#                                     build the edition statement; defaults
#                                     to an empty Hash.
#
# The caller's options Hash is not modified.
#
# Returns the number of holdings added.
def add_holdings(holdings_url, options = {})
  match_reliability = options[:match_reliability] || ServiceResponse::MatchExact
  marc_data = options[:marc_data] || {}

  atom = Nokogiri::XML( http_fetch(holdings_url).body )
  content_entries = atom.search("/atom:feed/atom:entry/atom:content", xml_ns)

  # For each atom entry, find the dlf_expanded record. For each dlf_expanded
  # record, take all of its holdingsrecs if it has them, or all of its
  # items if it doesn't. We wind up with one flat list of mixed
  # holdingsrecs and items.
  holdings_xml = content_entries.collect do |dlf_expanded|
    copies = dlf_expanded.xpath("dlf:record/dlf:holdings/dlf:holdingset/dlf:holdingsrec", xml_ns)
    copies.length > 0 ? copies : dlf_expanded.xpath("dlf:record/dlf:items/dlf:item", xml_ns)
  end.flatten

  service_data = holdings_xml.collect do |holding_node|
    # Walk back up to the owning atom entry to pick up its id and HTML link.
    atom_entry = holding_node.at_xpath("ancestor::atom:entry", xml_ns)
    atom_id = atom_entry.at_xpath("atom:id/text()", xml_ns).to_s

    edition_str = edition_statement(marc_data[atom_id])
    url = atom_entry.at_xpath("atom:link[@rel='alternate'][@type='text/html']/attribute::href", xml_ns).to_s

    xml_to_holdings(holding_node).merge(
      :service => self,
      :match_reliability => match_reliability,
      :edition_str => edition_str,
      :url => url
    )
  end

  # Strip out holdings that aren't really holdings: any holding whose value
  # for a configured key is one of the excluded values for that key.
  # @exclude_holdings may be unset, in which case nothing is excluded.
  exclusions = @exclude_holdings || {}
  service_data.delete_if do |data|
    exclusions.any? { |key, values| values.include?(data[key.to_sym]) }
  end

  # Sort by "collection". The sorted list must be kept (a bare #sort would
  # throw the result away); a missing collection sorts as "" and the
  # original position breaks ties so equal collections keep their order.
  service_data = service_data.each_with_index.sort_by do |data, position|
    [data[:collection_str].to_s, position]
  end.map(&:first)

  service_data.each do |data|
    request.add_service_response(data.merge(:service => self, :service_type_value =>"holding"))
  end

  return service_data.length
end

#bib_ids_from_atom_entries(entries) ⇒ Object



332
333
334
335
336
337
# File 'app/service_adaptors/blacklight.rb', line 332

# Collects bib ids from the atom:id text nodes found under +entries+.
#
# Each id is the run of non-slash characters at the end of the atom:id
# string; ids that yield no such run (for instance ones ending in "/")
# are left out.
#
# Returns an Array of Strings.
def bib_ids_from_atom_entries(entries)
  id_nodes = entries.xpath("atom:id/text()", xml_ns).to_a
  id_nodes.map { |id_node| id_node.to_s[/([^\/]+)$/, 1] }.compact
end

#blacklight_keyword_search_url(request, options = {}) ⇒ Object

Construct a CQL search against blacklight for author and title, possibly with serial limit. Ask for Atom with embedded MARC back.



193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'app/service_adaptors/blacklight.rb', line 193

# Builds the URL for a CQL title/author keyword search against Blacklight.
#
# request - the request whose referent supplies title and author.
# options - Hash; :content_format selects the content_format URL parameter
#           (default "marc"). The caller's Hash is not modified.
#
# We need both title and author to search keyword style, or we get too many
# false positives. The exception is serials: when a "serials_limit_clause"
# is configured in bl_fields and the referent is a serial, a title-only
# search is done and the limit clause is appended to the URL.
#
# Returns the URL String, or nil when there is not enough to search on
# (no usable title, or no author and no serials limit).
def blacklight_keyword_search_url(request, options = {})
  content_format = options[:content_format] || "marc"
  referent = request.referent

  # Use the appropriate 'container' title if available, not the article
  # title.
  title = referent['jtitle']
  title = referent['btitle'] if title.blank?
  title = referent['title'] if title.blank?

  # Remove sub-title for better search. Work on a copy: the string belongs
  # to the referent and must not be changed (it may also be frozen).
  title = title.gsub(/\:.*\Z/, '') unless title.blank?
  return nil if title.blank?

  author = get_top_level_creator(referent)
  serials_limit = @bl_fields["serials_limit_clause"]
  limit_to_serials = serials_limit && title_is_serial?(referent)
  return nil unless author || limit_to_serials

  # Phrase search for title, just raw dismax for author.
  # Embed quotes inside the quoted value, need to backslash-quote for CQL,
  # and backslash the backslashes for ruby literal.
  clauses = []
  clauses.push("#{@bl_fields["title"]} = \"\\\"#{escape_for_cql_double_quotes title}\\\"\"")
  clauses.push("#{@bl_fields["author"]} = \"#{escape_for_cql_double_quotes author}\"") if author

  url = base_url + "?search_field=#{@cql_search_field}&content_format=#{content_format}&q=#{CGI.escape(clauses.join(" AND "))}"
  url += "&" + serials_limit if limit_to_serials

  return url
end

#blacklight_precise_search_url(request, format = "marc") ⇒ Object

Send a CQL request for any identifiers present. Ask for an atom response with embedded marc21 back.



158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
# File 'app/service_adaptors/blacklight.rb', line 158

# Builds the URL for a CQL identifier search against Blacklight.
#
# request - the request whose referent supplies the identifiers.
# format  - value for the content_format URL parameter (default "marc").
#
# A clause is added for each of lccn, isbn and oclcnum that has both a
# configured field name in bl_fields and a non-empty value on the referent.
# ISSN is only added when no ISBN clause was added, which reduces false
# matches. A Solr document id clause is added when get_solr_id finds one.
# All clauses are OR'ed together.
#
# Identifier values come from the incoming OpenURL, so each one is passed
# through escape_for_cql_double_quotes before being placed inside the
# double-quoted CQL term, the same way the keyword search treats title and
# author.
#
# Returns the URL String, or nil when there is nothing to search on.
def blacklight_precise_search_url(request, format = "marc")
  referent = request.referent
  quoted = lambda { |value| "\"#{escape_for_cql_double_quotes(value.to_s)}\"" }
  usable = lambda { |value| !value.nil? && !value.to_s.strip.empty? }

  # Add search clauses for our identifiers, if we have them and have a
  # configured search field for them.
  clauses = []
  added = []
  ["lccn", "isbn", "oclcnum"].each do |key|
    next unless bl_fields[key]

    value = referent.send(key)
    next unless usable.call(value)

    clauses.push("#{bl_fields[key]} = #{quoted.call(value)}")
    added << key
  end

  # Only add ISSN if we don't have an ISBN, reduces false matches
  if !added.include?("isbn") && bl_fields["issn"]
    issn_value = referent.issn
    clauses.push("#{bl_fields["issn"]} = #{quoted.call(issn_value)}") if usable.call(issn_value)
  end

  # Add Solr document identifier if we can get one from the URL
  solr_id = get_solr_id(referent)
  clauses.push("#{bl_fields['id']} = #{quoted.call(solr_id)}") if solr_id

  # if we have nothing, we can do no search.
  return nil if clauses.empty?

  cql = clauses.join(" OR ")

  return base_url + "?search_field=#{@cql_search_field}&content_format=#{format}&q=#{CGI.escape(cql)}"
end

#blacklight_url_for_ids(ids, format = "dlf_expanded") ⇒ Object



339
340
341
342
343
# File 'app/service_adaptors/blacklight.rb', line 339

# Builds the URL for a CQL "any" search on the configured id field that
# matches every id in +ids+.
#
# ids    - list of document ids; they are joined with spaces into one
#          quoted CQL term.
# format - value for the content_format URL parameter
#          (default "dlf_expanded").
#
# Returns the URL String, or nil when +ids+ is empty.
def blacklight_url_for_ids(ids, format="dlf_expanded")
  return nil if ids.empty?

  cql = "#{@bl_fields["id"]} any \"#{ids.join(" ")}\""
  query_string = "?search_field=#{@cql_search_field}&content_format=#{format}&q=" + CGI.escape(cql)

  base_url + query_string
end

#escape_for_cql_double_quotes(str) ⇒ Object



238
239
240
241
242
243
# File 'app/service_adaptors/blacklight.rb', line 238

# Prepares +str+ for use inside a double-quoted CQL term: every double
# quote is replaced by a space and every single quote is doubled.
#
# Returns a new String; +str+ itself is left unchanged.
def escape_for_cql_double_quotes(str)
  without_double_quotes = str.gsub('"', " ")
  without_double_quotes.gsub("'", "''")
end

#filter_keyword_entries(request, atom_entries, options = {}) ⇒ Object



299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
# File 'app/service_adaptors/blacklight.rb', line 299

# Narrows keyword-search results down to the atom entries whose title
# actually matches the title in the request.
#
# request      - the request whose referent supplies the title (jtitle,
#                then btitle, then title; first non-blank one wins).
# atom_entries - node set of atom:entry elements to filter.
# options      - Hash, all keys optional:
#                :exclude_ids     - bib ids to drop even when the title
#                                   matches (default: none).
#                :remove_subtitle - when true, titles are additionally
#                                   compared with their subtitle removed.
#                                   Defaults to true; an explicit false is
#                                   honoured.
#
# An entry is kept when any of its title forms (downcased, normalized, and
# optionally normalized without subtitle) equals any of the request's title
# forms, and none of its bib ids is in :exclude_ids.
#
# The caller's options Hash is not modified.
#
# Returns a Nokogiri::XML::NodeSet of the entries kept; it is empty when the
# request has no usable title.
def filter_keyword_entries(request, atom_entries, options = {})
  exclude_ids = options[:exclude_ids] || []
  # Only fall back to the default when the option was not supplied;
  # "||= true" would silently turn an explicit false back into true.
  remove_subtitle = options[:remove_subtitle].nil? ? true : options[:remove_subtitle]

  referent = request.referent
  title = referent['jtitle']
  title = referent['btitle'] if title.blank?
  title = referent['title'] if title.blank?

  # Without a request title nothing can match.
  return Nokogiri::XML::NodeSet.new(atom_entries.document, []) if title.blank?

  # All the spellings of a title we are willing to compare on.
  title_forms = lambda do |text|
    forms = [text.downcase, normalize_title(text)]
    forms << normalize_title(text, :remove_subtitle => true) if remove_subtitle
    forms.compact.uniq
  end

  request_title_forms = title_forms.call(title)

  # Only keep entries with title match, and that aren't in the
  # exclude_ids list.
  good_entries = atom_entries.find_all do |atom_entry|
    entry_title = atom_entry.xpath("atom:title/text()", xml_ns).text

    !(title_forms.call(entry_title) & request_title_forms).empty? &&
      (bib_ids_from_atom_entries(atom_entry) & exclude_ids).empty?
  end

  return Nokogiri::XML::NodeSet.new(atom_entries.document, good_entries)
end

#get_solr_id(rft) ⇒ Object



346
347
348
349
350
351
352
353
354
355
356
# File 'app/service_adaptors/blacklight.rb', line 346

# Looks for a Solr document id among the referent's identifiers.
#
# Identifiers are checked in order; for each one the configured
# @rft_id_bibnum_prefixes are tried in order. The first identifier that
# starts with one of the prefixes wins, and whatever follows that prefix
# is returned.
#
# Returns the id String (possibly empty), or nil when no identifier starts
# with any configured prefix.
def get_solr_id(rft)
  rft.identifiers.each do |identifier|
    matched_prefix = @rft_id_bibnum_prefixes.find do |candidate|
      identifier.start_with?(candidate)
    end

    return identifier[matched_prefix.length..-1] if matched_prefix
  end

  nil
end

#handle(request) ⇒ Object



70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# File 'app/service_adaptors/blacklight.rb', line 70

# Runs the configured catalog searches for +request+ and records what they
# find as service responses.
#
# 1. Identifier search (when @identifier_search is on and an identifier URL
#    can be built): fetches MARC for the hits, adds their 856 links, then
#    fetches the same hits as dlf_expanded and adds their holdings.
# 2. Keyword search (when @keyword_search is on and a keyword URL can be
#    built): fetches MARC for title/author hits, drops entries whose title
#    does not match or that the identifier search already returned, then
#    adds 856 links and holdings for the rest with MatchUnsure reliability.
#    If @link_to_search is on and no holdings were added by either search,
#    a :holding_search response linking to the Blacklight result page is
#    added when the feed reports at least one hit.
#
# Returns the result of request.dispatched(self, true).
def handle(request)
  seen_bib_ids = []
  holdings_total = 0

  # Atom content elements carry the MARC record base64-encoded.
  decode_marc = lambda do |content_node|
    MARC::Reader.decode( Base64.decode64(content_node.text).force_encoding("UTF-8") )
  end

  precise_url = @identifier_search && blacklight_precise_search_url(request)
  if precise_url
    precise_doc = Nokogiri::XML( http_fetch(precise_url).body )

    seen_bib_ids.concat( bib_ids_from_atom_entries( precise_doc.xpath("atom:feed/atom:entry", xml_ns) ) )

    # Grab the MARC from our results.
    exact_marc = precise_doc.xpath("atom:feed/atom:entry/atom:content[@type='application/marc']", xml_ns).collect do |content_node|
      decode_marc.call(content_node)
    end

    add_856_links(request, exact_marc)

    # A second fetch is needed for the dlf_expanded info: only one
    # content format is requested per call.
    dlf_url = blacklight_precise_search_url(request, "dlf_expanded")
    holdings_total += add_holdings(dlf_url) if dlf_url
  end

  keyword_url = @keyword_search && blacklight_keyword_search_url(request)
  if keyword_url
    keyword_doc = Nokogiri::XML( http_fetch(keyword_url).body )

    # Filter out matches whose titles don't really match at all, or
    # which have already been seen in the identifier search.
    candidates = filter_keyword_entries(
      request,
      keyword_doc.xpath("atom:feed/atom:entry", xml_ns),
      :exclude_ids => seen_bib_ids,
      :remove_subtitle => !title_is_serial?(request.referent)
    )

    marc_by_atom_id = {}

    # Grab the MARC from the surviving entries only. Important not to do
    # a // xpath search, or we'd wind up matching elements that are not
    # part of 'candidates'.
    approximate_marc = candidates.xpath("atom:content[@type='application/marc']", xml_ns).collect do |content_node|
      record = decode_marc.call(content_node)
      atom_id = content_node.at_xpath("ancestor::atom:entry/atom:id/text()", xml_ns).to_s
      marc_by_atom_id[atom_id] = record
      record
    end

    # Everything left is treated as an approximate match. Put records
    # whose years include the request year first.
    wanted_year = get_year(request.referent)
    if wanted_year
      approximate_marc = approximate_marc.partition { |record| get_years(record).include?(wanted_year) }.flatten
    end

    # And add in the 856's
    add_856_links(request, approximate_marc, :match_reliability => ServiceResponse::MatchUnsure)

    # Fetch and add in the holdings
    ids_url = blacklight_url_for_ids(bib_ids_from_atom_entries(candidates))
    if ids_url
      holdings_total += add_holdings(ids_url, :match_reliability => ServiceResponse::MatchUnsure, :marc_data => marc_by_atom_id)
    end

    if @link_to_search && holdings_total == 0
      total_hits = keyword_doc.at_xpath("atom:feed/opensearch:totalResults/text()", xml_ns).to_s.to_i
      results_page_url = keyword_doc.at_xpath("atom:feed/atom:link[@rel='alternate'][@type='text/html']/attribute::href", xml_ns).to_s

      if total_hits > 0
        noun = total_hits > 1 ? 'matches' : 'match'
        request.add_service_response(
          :service => self,
          :source_name => @display_name,
          :count => total_hits,
          :display_text => "#{total_hits} possible #{noun} in #{@display_name}",
          :url => results_page_url,
          :service_type_value => :holding_search )
      end
    end
  end

  return request.dispatched(self, true)
end

#service_types_generatedObject

Standard method, used by background service updater. See Service docs.



63
64
65
66
67
# File 'app/service_adaptors/blacklight.rb', line 63

# Standard method, used by background service updater. See Service docs.
#
# Lists the service types this adaptor can add to a request: fulltext,
# holding, table_of_contents and relevant_link always, plus holding_search
# when @link_to_search is on, because #handle then may add a
# :holding_search response.
#
# Returns an Array of ServiceTypeValue entries.
def service_types_generated
  types = [ ServiceTypeValue[:fulltext], ServiceTypeValue[:holding], ServiceTypeValue[:table_of_contents], ServiceTypeValue[:relevant_link] ]
  types << ServiceTypeValue[:holding_search] if @link_to_search

  return types
end