Class: InternetArchive

Inherits:
Service
Includes:
MetadataHelper
Defined in:
app/service_adaptors/internet_archive.rb

Constant Summary

SERVICE_TYPE_MAP =

Maps the IA mediatype to an Umlaut service type.

{
  "texts" => :fulltext,
  "audio" => :audio
}
COLLECTION_LABELS =

Collection labels.

A list of collection labels can be found here: www.archive.org/advancedsearch.php?q=mediatype%3Acollection&fl[]=collection&fl[]=identifier&fl[]=title&sort[]=&sort[]=&sort[]=&rows=9999&indent=yes&fmt=json&xmlsearch=Search FIXME: either fetch these dynamically at intervals or add a fuller set below.

Currently there are over 4300 collections.

If we're going to do this as a static hash then it should be a class constant. Currently this hash contains a small selection of collections that include the 'audio' mediatype and all collections that contain the 'texts' mediatype.

{
  "CaliforniaFishandGame"=>"California Fish and Game",
  "ol_data"=>"Open Library Data",
  "worldhealthorganization"=>"World Health Organization",
  "opensource_movies"=>"Open Source Movies",
  "clairetcarneylibrary"=>
    "Claire T. Carney Library, University of Massachusetts Dartmouth",
  "university_of_illinois_urbana-champaign"=>
    "University of Illinois Urbana-Champaign",
  "smithsonian_books"=>"Smithsonian",
  "nhml_london"=>"Natural History Museum Library, London",
  "animationandcartoons"=>"Animation & Cartoons",
  "university_of_toronto_regis"=>"Regis College Library",
  "vlogs"=>"Vlogs",
  "opensource"=>"Open Source Books",
  "USGovernmentDocuments"=>"US Government Documents",
  "danceman"=>"Dance Manuals",
  "additional_collections"=>"Additional Collections",
  "internet_archive_books"=>"Internet Archive Books",
  "sloan"=>"Sloan Foundation",
  "iacl"=>"Children's Library",
  "audio_religion"=>"Spirituality & Religion",
  "microfilm"=>"Books from Microfilm",
  "toronto"=>"Canadian Libraries",
  "prelinger"=>"Prelinger Archives",
  "bostonpubliclibrary"=>"Boston Public Library",
  "sports"=>"Sports Videos",
  "universallibrary"=>"Universal Library",
  "sfpl"=>"The San Francisco Public Library",
  "university_of_toronto_knox"=>"Caven Library, Knox College",
  "memorial_university"=>"Memorial University of Newfoundland & Labrador",
  "MBLWHOI"=>"MBLWHOI Library",
  "oreilly_books"=>"O'Reilly",
  "burstein"=>"The Burstein Alice in Wonderland Collection",
  "ucroho"=>"Regional Oral History Office",
  "Brandeis_University"=>"Brandeis University Libraries",
  "birney_anti_slavery_collection"=>"Birney Anti-Slavery Collection",
  "Johns_Hopkins_University"=>"The Johns Hopkins University Sheridan Libraries",
  "culturalandacademicfilms"=>"Cultural & Academic Films",
  "Harvard_University"=>"Harvard University",
  "montana_state_publications"=>"Montana State Government Publications",
  "national_institute_for_newman_studies"=>
    "National Institute for Newman Studies",
  "buddha"=>"Buddha Books",
  "university_of_toronto_fisher"=>"Thomas Fisher Rare Book Library",
  "ryerson_university"=>"Ryerson University",
  "university_of_toronto_emmanuel"=>
    "Emmanuel College Library, Victoria University",
  "unica"=>"Unica: Rare Books from UIUC",
  "mugar"=>"The Mugar Memorial Library, Boston University",
  "havergal"=>"Havergal College",
  "university_of_toronto_gerstein"=>
    "University of Toronto - Gerstein Science Information Centre",
  "NY_Botanical_Garden"=>"The New York Botanical Garden",
  "calacademy"=>"California Academy of Sciences",
  "chm_fiche"=>"Computer History Museum",
  "university_of_toronto_crrs"=>
    "Centre for Reformation and Renaissance Studies Library",
  "djo"=>"Dickens Journals Online",
  "unclibraries"=>"University of North Carolina at Chapel Hill",
  "university_of_toronto_oise"=>"OISE/UT Library",
  "newsandpublicaffairs"=>"News & Public Affairs",
  "biodiversity"=>"Biodiversity Heritage Library",
  "university_of_ottawa"=>"University of Ottawa",
  "Wellesley_College_Library"=>"Wellesley College Library",
  "audio_foreign"=>"Non-English Audio",
  "national_library_of_australia"=>"National Library of Australia",
  "datadumps"=>"Open Library Data",
  "microfilmreel"=>"Reels of Microfilm",
  "saint_marys_college"=>"Saint Mary's College of California",
  "university_of_toronto_pratt"=>"E.J. Pratt Library",
  "Boston_College_Library"=>"Boston College Library",
  "uchicago"=>"University of Chicago",
  "audio_podcast"=>"Podcasts",
  "tufts"=>"Tufts University",
  "opensource_audio"=>"Open Source Audio",
  "university_of_toronto_trinity"=>"John W. Graham Library, Trinity College",
  "audio_tech"=>"Computers & Technology",
  "moviesandfilms"=>"Movies",
  "etree"=>"Live Music Archive",
  "marcuslucero"=>"the Marucs Lucero",
  "opencontentalliance"=>"Open Content Alliance",
  "radioprograms"=>"Radio Programs",
  "university_of_toronto_pims"=>"PIMS - University of Toronto",
  "newspapers"=>"Newspapers",
  "university_of_california_libraries"=>"University of California Libraries",
  "millionbooks"=>"Million Book Project",
  "university_of_toronto_robarts"=>"University of Toronto - Robarts Library",
  "university_of_toronto"=>"University of Toronto",
  "montana_state_library"=>"Montana State Library",
  "bancroft_library"=>"The Bancroft Library",
  "prelinger_library"=>"Prelinger Library",
  "libraryofcongress"=>"The Library of Congress",
  "richtest"=>"Test books from California",
  "mobot"=>"Missouri Botanical Garden",
  "gamevideos"=>"Video Games",
  "blc"=>"The Boston Library Consortium",
  "cdl"=>"California Digital Library",
  "Princeton"=>"Princeton Theological Seminary",
  "mcmaster_university"=>"McMaster University",
  "sanfranciscopubliclibrary"=>"San Francisco Public Library",
  "spanish_texts"=>"The Spanish Language Library",
  "boston_college_libraries"=>"The Boston College Libraries",
  "gutenberg"=>"Project Gutenberg",
  "Music_UniversityofToronto"=>"Music - University of Toronto",
  "msn_books"=>"Microsoft",
  "youth_media"=>"Youth Media",
  "independent"=>"independent texts",
  "carletonlibrary"=>"Carleton University Library",
  "arpanet"=>"Arpanet",
  "yahoo_books"=>"Yahoo!",
  "johnadamsBPL"=>"The John Adams Library at the Boston Public Library",
  "library_of_congress"=>"The Library of Congress",
  "ColumbiaUniversityLibraries"=>"Columbia University Libraries",
  "university_of_guelph"=>"University of Guelph",
  "GratefulDead"=>"Grateful Dead",
  "audio_bookspoetry"=>"Audio Books & Poetry",
  "ncsulibraries"=>"North Carolina State University Libraries",
  "brown_university_library"=>"Brown University Library",
  "Allen_County_Public_Library"=>"Allen County Public Library",
  "yrlsc"=>"The Charles E. Young Research Library Special Collections",
  "torontotest"=>"Test books from Canada",
  "americana"=>"American Libraries",
  "librivoxaudio"=>"LibriVox",
  "audio_music"=>"Music & Arts",
  "toronto_public_library"=>"Toronto Public Library",
  "getty"=>"Research Library, Getty Research Institute",
  "ontla"=>"The Legislative Assembly of Ontario Collection",
  "TheChristianRadical"=>"The Christian Radical",
  "netlabels"=>"Netlabels",
  "newyorkpubliclibrary"=>"New York Public Library",
  "University_of_New_Hampshire_Library"=>"University of New Hampshire Library",
  "cbk"=>"Cook Books and Home Economics",
  "audio_news"=>"News & Public Affairs",
  "ant_texts"=>"Ant Texts",
  "computersandtechvideos"=>"Computers & Technology",
  "the_beat_within"=>"The Beat Within Magazine",
  "university_of_toronto_kelly"=>"University of Toronto - John M Kelly Library",
  "library_and_archives_canada"=>"Library and Archives Canada",
  "ephemera"=>"Ephemeral Films",
  "OXFAM"=>"Oxfam",
  "foreignlanguagevideos"=>"Non-English Videos",
  "MontanaStateLibrary"=>"Montana State Library",
  "EarthSciences_UniversityofToronto"=>"Earth Sciences University of Toronto",
  "octavo"=>"Octavo",
  "artsandmusicvideos"=>"Arts & Music"
}
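
For illustration only, a minimal sketch of how COLLECTION_LABELS is used to build a display name in #do_query; the result hash here is hypothetical:

# Hypothetical IA search result (abridged).
result = { "identifier" => "examplebook00", "collection" => ["gutenberg", "americana"] }

display_name = "the Internet Archive"
if result["collection"] && InternetArchive::COLLECTION_LABELS[result["collection"][0]]
  display_name += ": " + InternetArchive::COLLECTION_LABELS[result["collection"][0]]
end
display_name  # => "the Internet Archive: Project Gutenberg"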

Constants inherited from Service

Service::LinkOutFilterTask, Service::StandardTask

Instance Attribute Summary

Attributes inherited from Service

#group, #name, #priority, #request, #service_id, #status, #task

Instance Method Summary

Methods included from MetadataHelper

#get_doi, #get_epage, #get_gpo_item_nums, #get_identifier, #get_isbn, #get_issn, #get_lccn, #get_month, #get_oclcnum, #get_pmid, #get_search_creator, #get_search_terms, #get_search_title, #get_spage, #get_sudoc, #get_top_level_creator, #get_year, #normalize_lccn, #normalize_title, #raw_search_title, title_is_serial?

Methods included from MarcHelper

#add_856_links, #edition_statement, #get_title, #get_years, #gmd_values, #service_type_for_856, #should_skip_856_link?, #strip_gmd

Methods inherited from Service

#credits, #display_name, #handle_wrapper, #link_out_filter, #preempted_by, required_config_params, #translate

Constructor Details

#initialize(config) ⇒ InternetArchive

Returns a new instance of InternetArchive.



# File 'app/service_adaptors/internet_archive.rb', line 49

def initialize(config)
  # Default base URL for the IA advanced search. We use this rather than
  # the IA Solr index directly because IA suggests that the Solr home
  # may change over time.
  @url = 'http://www.archive.org/advancedsearch.php?'
  # default number of results to return
  @num_results = 1
  # default IA mediatypes to search
  @mediatypes = ["texts", "audio"]
  # Should the web link to further results be shown? default to true
  @show_web_link = true
  @display_name = "the Internet Archive"
  @http_timeout = 5.seconds
  @include_search_inside = false
  
  @credits = {
    "The Internet Archive" => "http://archive.org/"
  }
  
  super(config)
  @num_results_for_types ||= {}
  @mediatypes.each do |type|
    @num_results_for_types[type] ||= @num_results
  end
end
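
A hedged configuration sketch: the keys below mirror the instance variables defaulted above and are assumed to be overridable through the config hash handled by super. Key names and values are illustrative only, not confirmed against the Umlaut configuration documentation.

# Hypothetical configuration; each key is assumed to override the matching default.
config = {
  "num_results"           => 2,            # show up to 2 results per mediatype
  "mediatypes"            => ["texts"],    # only search IA texts
  "show_web_link"         => true,         # add a sidebar link to further results
  "include_search_inside" => false,        # skip the search-inside HEAD check
  "http_timeout"          => 5
}
service = InternetArchive.new(config)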

Instance Attribute Details

#mediatypes ⇒ Object (readonly)

No parameters are required, we have working defaults for them all.



# File 'app/service_adaptors/internet_archive.rb', line 31

def mediatypes
  @mediatypes
end

#num_results ⇒ Object (readonly)

No parameters are required, we have working defaults for them all.



# File 'app/service_adaptors/internet_archive.rb', line 31

def num_results
  @num_results
end

#url ⇒ Object (readonly)

No parameters are required, we have working defaults for them all.



# File 'app/service_adaptors/internet_archive.rb', line 31

def url
  @url
end

Instance Method Details

#create_query_params(search_terms, type = nil) ⇒ Object

If given a type, it searches only that mediatype; otherwise it does an OR search across all configured mediatypes.



# File 'app/service_adaptors/internet_archive.rb', line 219

def create_query_params(search_terms, type=nil)
  # Downcase params to avoid weird misconfiguration in IA's Solr
  # installation, where it's interpreting uppercase words as
  # commands even within quotes. Also take out any parens in input.
  # IA also does not seem to accept semi-colons in input.
  title = safe_argument(search_terms[:title])
  
  
  params = 'title:' << CGI.escape('"' << title << '"')
  if (! search_terms[:creator].blank?)
    creator = safe_argument(search_terms[:creator])      
    params << '+AND+creator:' << CGI.escape('(' << creator << ')')       
  end
  mt = []
  params <<  '+AND+('
  if type
    params << 'mediatype:' << type
  else
    @mediatypes.each do |t|
      mt << ('mediatype:' << t)
    end
    params << mt.join('+OR+') 
  end
  params << ')' #closing the mediatypes with a paren
end
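
A rough illustration of the string this builds, assuming the default mediatypes and hypothetical, already-safe search terms:

search_terms = { :title => "the great gatsby", :creator => "fitzgerald" }
create_query_params(search_terms)
# => "title:%22the+great+gatsby%22+AND+creator:%28fitzgerald%29+AND+(mediatype:texts+OR+mediatype:audio)"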

#create_result_url(result) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 191

def create_result_url(result)
  'http://archive.org/details/' + result['identifier']
end


#create_web_link_url(search_terms, type) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 211

def create_web_link_url(search_terms, type)
  'http://www.archive.org/search.php?query=' << create_query_params(search_terms, type)
  #url << CGI.escape('mediatype:' << type << ' AND ')
  
end

#do_query(request) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 84

def do_query(request)
  # get the search terms for use in both fulltext search and highlighted_link.
  # IA does index apostrophes, although not generally other punctuation. Need to keep them.
  search_terms = {:title => get_search_title(request.referent, :keep_apostrophes => true),
                  :creator => get_search_creator(request.referent)}
  

  
  # We need both title and author to continue
  return nil if (search_terms[:title].blank? || search_terms[:creator].blank?)

  # Return if this is a journal article link; an IA search can do nothing
  # for us except waste CPU cycles for us and IA.
  metadata = request.referent.metadata
  return nil unless metadata["atitle"].blank? &&
                    metadata["issue"].blank? &&
                    metadata["volume"].blank?
  
  # create one link that searches all configured mediatypes
  link = @url + ia_params(search_terms)
  
  # using open() conveniently follows the redirect for us. Alas, it
  # doesn't give us access to the IA http status code response though.
  response = nil
  timeout(@http_timeout.to_i) {
    response = open(link).read
  }

  if response.blank?
    raise Exception.new("InternetArchive returned empty response for #{link}")      
  end
  
  doc = MultiJson.load(response)
  results = doc['response']['docs']
  
  @mediatypes.each do |type|
    hits = matching_hits(request, search_terms, results, type)

   
    # if we have more results than we want to show in the main view
    # we can create a link (highlighted_link) to the search in the sidebar

    num_found = hits.length #doc['response']['numFound']
    if (@show_web_link and not hits.empty? and @num_results_for_types[type] < num_found )
      do_web_link(request, search_terms, type, num_found) 
    end

    # Check for search inside only for the first result of type 'texts'
    if (@include_search_inside &&
        type == 'texts' &&
        (first_hit = hits[0]) && 
        (identifier = first_hit["identifier"])
        )
      direct_url = URI.parse("http://www.archive.org/stream/" + identifier)

      # HEAD request; if we get a 200, we think it means we have a
      # page turner with search.
      req = Net::HTTP.new(direct_url.host, direct_url.port)
      response = req.request_head(direct_url.path)
      if response.code == "200"
        # search inside!
        request.add_service_response(
          :service => self,
          :display_text=> @display_name,
          :display_text_i18n => "display_name",
          :url => direct_url.to_s,
          :service_type_value => :search_inside
        )
      end        
    end



    # add a service response for each result for this mediatype
    hits.each_with_index do |result, index|
      break if index >= @num_results_for_types[type] 

      display_name = @display_name
      
      if result["contributor"] && result["contributor"].first
        display_name += ": " + result["contributor"].first
      elsif ( result["collection"] && COLLECTION_LABELS[result["collection"][0]])
        display_name += ": " + COLLECTION_LABELS[result["collection"][0]]
      end
      
      service_type = SERVICE_TYPE_MAP[type]
      request.add_service_response(
          :service=>self, 
          :display_text=>display_name, 
          :display_text_i18n => "display_name",
          :url=>create_result_url(result),
          :match_reliability => ServiceResponse::MatchUnsure,
          :edition_str => edition_str(result),
          :service_type_value => service_type )        
    end  
  end
end
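
For orientation, the body parsed by MultiJson.load above is a Solr-style JSON wrapper; the keys accessed in the code are shown below in a hypothetical, abridged example (field values invented):

# Hypothetical, abridged document as returned by MultiJson.load (values invented).
doc = {
  "response" => {
    "numFound" => 2,
    "start"    => 0,
    "docs"     => [
      { "identifier" => "mobydickorwhale01melv",
        "title"      => "Moby Dick; or, The Whale",
        "mediatype"  => "texts",
        "collection" => ["americana"],
        "creator"    => ["Melville, Herman"],
        "date"       => "1892-01-01T00:00:00Z" }
    ]
  }
}
doc['response']['docs'].first['identifier']  # => "mobydickorwhale01melv"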

#do_web_link(request, search_terms, type, num_found) ⇒ Object

Displaying num_found relies on the number of results requested by ia_params being enough to capture all results for a mediatype. If there are more potential results, num_found will not be accurate, but it is good enough.



# File 'app/service_adaptors/internet_archive.rb', line 198

def do_web_link(request, search_terms, type, num_found)
  display_text = "#{num_found} digital #{type.singularize} " + (num_found > 1 ? "files" : "file")

  
  url = create_web_link_url(search_terms, type)
  request.add_service_response(  
      :service=>self,    
      :url=>url,
      :display_text=>display_text, 
      :service_type_value => :highlighted_link   
   )
end
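
As a quick illustration (values hypothetical), the display text built above reads, for example:

num_found = 3
type = "texts"
"#{num_found} digital #{type.singularize} " + (num_found > 1 ? "files" : "file")
# => "3 digital text files"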

#edition_str(result) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 300

def edition_str(result)
  edition_str = ""
  
  edition_str << result['title'] unless result['title'].blank?

  edition_str << " / #{result['creator'].first}" unless result['creator'].blank?
  edition_str << ". #{result["publisher"].first}" unless result['publisher'].blank?
  unless result['date'].blank?
    year = result['date'].slice(0,4)
    edition_str << ": #{year}"
  end
  
  edition_str = nil if edition_str.blank?

  return edition_str
end
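
A small illustration of the edition string this produces; the result hash is hypothetical:

# Hypothetical IA result hash.
result = { "title"     => "Moby Dick",
           "creator"   => ["Melville, Herman"],
           "publisher" => ["Harper & Brothers"],
           "date"      => "1851-01-01T00:00:00Z" }
edition_str(result)
# => "Moby Dick / Melville, Herman. Harper & Brothers: 1851"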

#handle(request) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 75

def handle(request)
  begin
    do_query(request)
  rescue Timeout::Error => e
    return request.dispatched(self, false, e)
  end
  return request.dispatched(self, true)
end

#ia_params(search_terms) ⇒ Object

Here we create params in the format that the IA advanced search needs. These are Solr-like params.



# File 'app/service_adaptors/internet_archive.rb', line 184

def ia_params(search_terms)
  return nil if search_terms[:title].nil?
  params = 'fl%5B%5D=*&fmt=json&xmlsearch=Search' #&indent=yes
  params << "&rows=999&q=" #is 999 too many or even too few?
  params << create_query_params(search_terms)   
end
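
Continuing the hypothetical search terms used for create_query_params above, the full parameter string comes out roughly as:

search_terms = { :title => "the great gatsby", :creator => "fitzgerald" }
ia_params(search_terms)
# => "fl%5B%5D=*&fmt=json&xmlsearch=Search&rows=999&q=title:%22the+great+gatsby%22+AND+creator:%28fitzgerald%29+AND+(mediatype:texts+OR+mediatype:audio)"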

#matching_hits(request, search_terms, results, type) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 265

def matching_hits(request, search_terms, results, type)    
  full_title = raw_search_title(request.referent)

  hits =  results.find_all do |r|      
    r["mediatype"] == type &&
    titles_sufficiently_matched(search_terms[:title], full_title, r["title"])   
  end

  return hits
end

#response_url(service_type, submitted_params) ⇒ Object

Catch and redirect response_url for search_inside.



# File 'app/service_adaptors/internet_archive.rb', line 318

def response_url(service_type, submitted_params)
  if ( ! (service_type.service_type_value.name == "search_inside" ))
    return super(service_type, submitted_params)
  else
    base = service_type.service_response[:url]
    query = CGI.escape(submitted_params["query"] || "")
    url = base + "#search/#{query}"
    return url
  end
end
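
A small sketch of the search-inside branch; the stream identifier and query are hypothetical:

base  = "http://www.archive.org/stream/mobydickorwhale01melv"
query = CGI.escape("white whale")
base + "#search/#{query}"
# => "http://www.archive.org/stream/mobydickorwhale01melv#search/white+whale"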

#safe_argument(string) ⇒ Object

Used on values that will be placed into a URL as search terms. Does NOT CGI-escape, but does make them safe in other ways for IA.



# File 'app/service_adaptors/internet_archive.rb', line 247

def safe_argument(string)
  # Downcase params to avoid weird misconfiguration in IA's SOLR
  # installation, where it's interpreting uppercase words as
  # commands even within quotes. 
  output = string.downcase
  
  # Remove parens, semi-colons, brackets, hyphens, colons -- they all mess
  # up IA, which thinks they are special chars. Also remove the double quote,
  # a special char which sometimes we want to use ourselves. Replace
  # all with spaces to avoid accidentally conjoining words.
  # (Could we escape instead? Not worth it, we don't want to search
  # on these anyway. Remove ALL punctuation? Not sure.)
  output.gsub!(/[)(\]\[;"\=\-\:]/, ' ')
  
  return output
end
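
For illustration (input hypothetical), the downcasing and punctuation replacement behave like this:

safe_argument('Hamlet: Prince of Denmark (Norton)')
# => "hamlet  prince of denmark  norton "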

#service_types_generated ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 39

def service_types_generated
  types = [ 
    ServiceTypeValue[:fulltext], 
    ServiceTypeValue[:audio],
    ServiceTypeValue[:'highlighted_link']      
    ]
  types << ServiceTypeValue[:search_inside] if @include_search_inside
  return types
end

#titles_sufficiently_matched(query_title, full_title, result) ⇒ Object

Some obtuse code to heuristically decide whether our query title and a result title match closely enough to be considered the same work.



# File 'app/service_adaptors/internet_archive.rb', line 278

def titles_sufficiently_matched(query_title, full_title, result)    
  normalized_query      = normalize_title(query_title)
  normalized_full_title = normalize_title(full_title)
  # If the title has more than 3 words, and our IA query returned
  # a result for it -- that's probably good enough. 
  return true if normalized_query.split(" ").length > 3


  # Otherwise, make multiple versions of the candidate
  # title -- the whole thing, the title until the first colon,
  # and the title until the first comma or semi-colon or other punct. Normalize
  # them all. See if any of them match EITHER our search title or
  # our full title. 
  candidates = [
    result,
    result.split(":").first,
    result.split(/[\;\,\(\)]/).first
  ].compact.uniq.collect {|a| normalize_title(a)}
  
  return (candidates & [normalized_query, normalized_full_title]).present?
end
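
A brief sketch with hypothetical titles; because the query has three or fewer words, the candidate-comparison path is exercised. This assumes normalize_title (from MetadataHelper) is case-insensitive:

query_title = "moby dick"
full_title  = "Moby Dick; or, The Whale"
result      = "Moby Dick; or, The Whale"
titles_sufficiently_matched(query_title, full_title, result)
# => true -- the candidate truncated at the first ";" normalizes to the same
#    string as the query title, so the intersection is non-empty.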