Class: InternetArchive

Inherits:
Service
Includes:
MetadataHelper
Defined in:
app/service_adaptors/internet_archive.rb

Constant Summary

SERVICE_TYPE_MAP =

Maps the IA mediatype to an Umlaut service type.

{
  "texts" => :fulltext,
  "audio" => :audio
}
COLLECTION_LABELS =

Collection labels.

A list of collection labels can be found here: www.archive.org/advancedsearch.php?q=mediatype%3Acollection&fl[]=collection&fl[]=identifier&fl[]=title&sort[]=&sort[]=&sort[]=&rows=9999&indent=yes&fmt=json&xmlsearch=Search FIXME: either fetch these dynamically at intervals or add a fuller set below.

Currently there are over 4300 collections.

If we're going to do this as a static hash then it should be a class constant. Currently this hash contains a small selection of collections that include the 'audio' mediatype and all collections that contain the 'texts' mediatype.

{
  "CaliforniaFishandGame"=>"California Fish and Game",
  "ol_data"=>"Open Library Data",
  "worldhealthorganization"=>"World Health Organization",
  "opensource_movies"=>"Open Source Movies",
  "clairetcarneylibrary"=>
    "Claire T. Carney Library, University of Massachusetts Dartmouth",
  "university_of_illinois_urbana-champaign"=>
    "University of Illinois Urbana-Champaign",
  "smithsonian_books"=>"Smithsonian",
  "nhml_london"=>"Natural History Museum Library, London",
  "animationandcartoons"=>"Animation & Cartoons",
  "university_of_toronto_regis"=>"Regis College Library",
  "vlogs"=>"Vlogs",
  "opensource"=>"Open Source Books",
  "USGovernmentDocuments"=>"US Government Documents",
  "danceman"=>"Dance Manuals",
  "additional_collections"=>"Additional Collections",
  "internet_archive_books"=>"Internet Archive Books",
  "sloan"=>"Sloan Foundation",
  "iacl"=>"Children's Library",
  "audio_religion"=>"Spirituality & Religion",
  "microfilm"=>"Books from Microfilm",
  "toronto"=>"Canadian Libraries",
  "prelinger"=>"Prelinger Archives",
  "bostonpubliclibrary"=>"Boston Public Library",
  "sports"=>"Sports Videos",
  "universallibrary"=>"Universal Library",
  "sfpl"=>"The San Francisco Public Library",
  "university_of_toronto_knox"=>"Caven Library, Knox College",
  "memorial_university"=>"Memorial University of Newfoundland & Labrador",
  "MBLWHOI"=>"MBLWHOI Library",
  "oreilly_books"=>"O'Reilly",
  "burstein"=>"The Burstein Alice in Wonderland Collection",
  "ucroho"=>"Regional Oral History Office",
  "Brandeis_University"=>"Brandeis University Libraries",
  "birney_anti_slavery_collection"=>"Birney Anti-Slavery Collection",
  "Johns_Hopkins_University"=>"The Johns Hopkins University Sheridan Libraries",
  "culturalandacademicfilms"=>"Cultural & Academic Films",
  "Harvard_University"=>"Harvard University",
  "montana_state_publications"=>"Montana State Government Publications",
  "national_institute_for_newman_studies"=>
    "National Institute for Newman Studies",
  "buddha"=>"Buddha Books",
  "university_of_toronto_fisher"=>"Thomas Fisher Rare Book Library",
  "ryerson_university"=>"Ryerson University",
  "university_of_toronto_emmanuel"=>
    "Emmanuel College Library, Victoria University",
  "unica"=>"Unica: Rare Books from UIUC",
  "mugar"=>"The Mugar Memorial Library, Boston University",
  "havergal"=>"Havergal College",
  "university_of_toronto_gerstein"=>
    "University of Toronto - Gerstein Science Information Centre",
  "NY_Botanical_Garden"=>"The New York Botanical Garden",
  "calacademy"=>"California Academy of Sciences",
  "chm_fiche"=>"Computer History Museum",
  "university_of_toronto_crrs"=>
    "Centre for Reformation and Renaissance Studies Library",
  "djo"=>"Dickens Journals Online",
  "unclibraries"=>"University of North Carolina at Chapel Hill",
  "university_of_toronto_oise"=>"OISE/UT Library",
  "newsandpublicaffairs"=>"News & Public Affairs",
  "biodiversity"=>"Biodiversity Heritage Library",
  "university_of_ottawa"=>"University of Ottawa",
  "Wellesley_College_Library"=>"Wellesley College Library",
  "audio_foreign"=>"Non-English Audio",
  "national_library_of_australia"=>"National Library of Australia",
  "datadumps"=>"Open Library Data",
  "microfilmreel"=>"Reels of Microfilm",
  "saint_marys_college"=>"Saint Mary's College of California",
  "university_of_toronto_pratt"=>"E.J. Pratt Library",
  "Boston_College_Library"=>"Boston College Library",
  "uchicago"=>"University of Chicago",
  "audio_podcast"=>"Podcasts",
  "tufts"=>"Tufts University",
  "opensource_audio"=>"Open Source Audio",
  "university_of_toronto_trinity"=>"John W. Graham Library, Trinity College",
  "audio_tech"=>"Computers & Technology",
  "moviesandfilms"=>"Movies",
  "etree"=>"Live Music Archive",
  "marcuslucero"=>"the Marucs Lucero",
  "opencontentalliance"=>"Open Content Alliance",
  "radioprograms"=>"Radio Programs",
  "university_of_toronto_pims"=>"PIMS - University of Toronto",
  "newspapers"=>"Newspapers",
  "university_of_california_libraries"=>"University of California Libraries",
  "millionbooks"=>"Million Book Project",
  "university_of_toronto_robarts"=>"University of Toronto - Robarts Library",
  "university_of_toronto"=>"University of Toronto",
  "montana_state_library"=>"Montana State Library",
  "bancroft_library"=>"The Bancroft Library",
  "prelinger_library"=>"Prelinger Library",
  "libraryofcongress"=>"The Library of Congress",
  "richtest"=>"Test books from California",
  "mobot"=>"Missouri Botanical Garden",
  "gamevideos"=>"Video Games",
  "blc"=>"The Boston Library Consortium",
  "cdl"=>"California Digital Library",
  "Princeton"=>"Princeton Theological Seminary",
  "mcmaster_university"=>"McMaster University",
  "sanfranciscopubliclibrary"=>"San Francisco Public Library",
  "spanish_texts"=>"The Spanish Language Library",
  "boston_college_libraries"=>"The Boston College Libraries",
  "gutenberg"=>"Project Gutenberg",
  "Music_UniversityofToronto"=>"Music - University of Toronto",
  "msn_books"=>"Microsoft",
  "youth_media"=>"Youth Media",
  "independent"=>"independent texts",
  "carletonlibrary"=>"Carleton University Library",
  "arpanet"=>"Arpanet",
  "yahoo_books"=>"Yahoo!",
  "johnadamsBPL"=>"The John Adams Library at the Boston Public Library",
  "library_of_congress"=>"The Library of Congress",
  "ColumbiaUniversityLibraries"=>"Columbia University Libraries",
  "university_of_guelph"=>"University of Guelph",
  "GratefulDead"=>"Grateful Dead",
  "audio_bookspoetry"=>"Audio Books & Poetry",
  "ncsulibraries"=>"North Carolina State University Libraries",
  "brown_university_library"=>"Brown University Library",
  "Allen_County_Public_Library"=>"Allen County Public Library",
  "yrlsc"=>"The Charles E. Young Research Library Special Collections",
  "torontotest"=>"Test books from Canada",
  "americana"=>"American Libraries",
  "librivoxaudio"=>"LibriVox",
  "audio_music"=>"Music & Arts",
  "toronto_public_library"=>"Toronto Public Library",
  "getty"=>"Research Library, Getty Research Institute",
  "ontla"=>"The Legislative Assembly of Ontario Collection",
  "TheChristianRadical"=>"The Christian Radical",
  "netlabels"=>"Netlabels",
  "newyorkpubliclibrary"=>"New York Public Library",
  "University_of_New_Hampshire_Library"=>"University of New Hampshire Library",
  "cbk"=>"Cook Books and Home Economics",
  "audio_news"=>"News & Public Affairs",
  "ant_texts"=>"Ant Texts",
  "computersandtechvideos"=>"Computers & Technology",
  "the_beat_within"=>"The Beat Within Magazine",
  "university_of_toronto_kelly"=>"University of Toronto - John M Kelly Library",
  "library_and_archives_canada"=>"Library and Archives Canada",
  "ephemera"=>"Ephemeral Films",
  "OXFAM"=>"Oxfam",
  "foreignlanguagevideos"=>"Non-English Videos",
  "MontanaStateLibrary"=>"Montana State Library",
  "EarthSciences_UniversityofToronto"=>"Earth Sciences University of Toronto",
  "octavo"=>"Octavo",
  "artsandmusicvideos"=>"Arts & Music"
}
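
For illustration only, a minimal sketch of how COLLECTION_LABELS is used to build a display name in #do_query; the result hash here is hypothetical:

# Hypothetical IA search result (abridged).
result = { "identifier" => "examplebook00", "collection" => ["gutenberg", "americana"] }

display_name = "the Internet Archive"
if result["collection"] && InternetArchive::COLLECTION_LABELS[result["collection"][0]]
  display_name += ": " + InternetArchive::COLLECTION_LABELS[result["collection"][0]]
end
display_name  # => "the Internet Archive: Project Gutenberg"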

Constants inherited from Service

Service::LinkOutFilterTask, Service::StandardTask

Instance Attribute Summary

Attributes inherited from Service

#group, #name, #priority, #request, #service_id, #status, #task

Instance Method Summary

Methods included from MetadataHelper

#get_doi, #get_epage, #get_gpo_item_nums, #get_identifier, #get_isbn, #get_issn, #get_lccn, #get_month, #get_oclcnum, #get_pmid, #get_search_creator, #get_search_terms, #get_search_title, #get_spage, #get_sudoc, #get_top_level_creator, #get_year, #normalize_lccn, #normalize_title, #raw_search_title, title_is_serial?

Methods included from MarcHelper

#add_856_links, #edition_statement, #get_title, #get_years, #gmd_values, #service_type_for_856, #should_skip_856_link?, #strip_gmd

Methods inherited from Service

#credits, #display_name, #handle_wrapper, #link_out_filter, #preempted_by, required_config_params, #translate

Constructor Details

#initialize(config) ⇒ InternetArchive

Returns a new instance of InternetArchive.



# File 'app/service_adaptors/internet_archive.rb', line 49

def initialize(config)
  # Default base URL for the IA advanced search. We use this rather than
  # the IA Solr index directly because IA suggests that the Solr home
  # may change over time.
  @url = 'http://www.archive.org/advancedsearch.php?'
  # default number of results to return
  @num_results = 1
  # default IA mediatypes to search
  @mediatypes = ["texts", "audio"]
  # Should the web link to further results be shown? default to true
  @show_web_link = true
  @display_name = "the Internet Archive"
  @http_timeout = 5.seconds
  @include_search_inside = false
  
  @credits = {
    "The Internet Archive" => "http://archive.org/"
  }
  
  super(config)
  @num_results_for_types ||= {}
  @mediatypes.each do |type|
    @num_results_for_types[type] ||= @num_results
  end
end
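
A hedged configuration sketch: the keys below mirror the instance variables defaulted above and are assumed to be overridable through the config hash handled by super. Key names and values are illustrative only, not confirmed against the Umlaut configuration documentation.

# Hypothetical configuration; each key is assumed to override the matching default.
config = {
  "num_results"           => 2,            # show up to 2 results per mediatype
  "mediatypes"            => ["texts"],    # only search IA texts
  "show_web_link"         => true,         # add a sidebar link to further results
  "include_search_inside" => false,        # skip the search-inside HEAD check
  "http_timeout"          => 5
}
service = InternetArchive.new(config)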

Instance Attribute Details

#mediatypes ⇒ Object (readonly)

No parameters are required, we have working defaults for them all.



# File 'app/service_adaptors/internet_archive.rb', line 31

def mediatypes
  @mediatypes
end

#num_results ⇒ Object (readonly)

No parameters are required, we have working defaults for them all.



# File 'app/service_adaptors/internet_archive.rb', line 31

def num_results
  @num_results
end

#url ⇒ Object (readonly)

No parameters are required, we have working defaults for them all.



# File 'app/service_adaptors/internet_archive.rb', line 31

def url
  @url
end

Instance Method Details

#create_query_params(search_terms, type = nil) ⇒ Object

If given a type, it searches only that mediatype; otherwise it does an OR search across all configured mediatypes.



# File 'app/service_adaptors/internet_archive.rb', line 219

def create_query_params(search_terms, type=nil)
  # Downcase params to avoid weird misconfiguration in IA's Solr
  # installation, where it's interpreting uppercase words as
  # commands even within quotes. Also take out any parens in input.
  # IA also does not seem to accept semi-colons in input.
  title = safe_argument(search_terms[:title])
  
  
  params = 'title:' << CGI.escape('"' << title << '"')
  if (! search_terms[:creator].blank?)
    creator = safe_argument(search_terms[:creator])      
    params << '+AND+creator:' << CGI.escape('(' << creator << ')')       
  end
  mt = []
  params <<  '+AND+('
  if type
    params << 'mediatype:' << type
  else
    @mediatypes.each do |t|
      mt << ('mediatype:' << t)
    end
    params << mt.join('+OR+') 
  end
  params << ')' #closing the mediatypes with a paren
end
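
A rough illustration of the string this builds, assuming the default mediatypes and hypothetical, already-safe search terms:

search_terms = { :title => "the great gatsby", :creator => "fitzgerald" }
create_query_params(search_terms)
# => "title:%22the+great+gatsby%22+AND+creator:%28fitzgerald%29+AND+(mediatype:texts+OR+mediatype:audio)"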

#create_result_url(result) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 191

def create_result_url(result)
  'http://archive.org/details/' + result['identifier']
end


#create_web_link_url(search_terms, type) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 211

def create_web_link_url(search_terms, type)
  'http://www.archive.org/search.php?query=' << create_query_params(search_terms, type)
  #url << CGI.escape('mediatype:' << type << ' AND ')
  
end

#do_query(request) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 84

def do_query(request)
  # get the search terms for use in both fulltext search and highlighted_link.
  # IA does index apostrophes, although not generally other punctuation. Need to keep them.
  search_terms = {:title => get_search_title(request.referent, :keep_apostrophes => true),
                  :creator => get_search_creator(request.referent)}
  

  
  # We need both title and author to continue
  return nil if (search_terms[:title].blank? || search_terms[:creator].blank?)

  # Return if this is a journal article link; an IA search can do nothing
  # for us except waste CPU cycles for us and IA.
  metadata = request.referent.metadata
  return nil unless metadata["atitle"].blank? &&
                    metadata["issue"].blank? &&
                    metadata["volume"].blank?
  
  # create one link that searches all configured mediatypes
  link = @url + ia_params(search_terms)
  
  # using open() conveniently follows the redirect for us. Alas, it
  # doesn't give us access to the IA http status code response though.
  response = nil
  timeout(@http_timeout.to_i) {
    response = open(link).read
  }

  if response.blank?
    raise Exception.new("InternetArchive returned empty response for #{link}")      
  end
  
  doc = MultiJson.load(response)
  results = doc['response']['docs']
  
  @mediatypes.each do |type|
    hits = matching_hits(request, search_terms, results, type)

   
    # if we have more results than we want to show in the main view
    # we can create a link (highlighted_link) to the search in the sidebar

    num_found = hits.length #doc['response']['numFound']
    if (@show_web_link and not hits.empty? and @num_results_for_types[type] < num_found )
      do_web_link(request, search_terms, type, num_found) 
    end

    # Check for search inside only for the first result of type 'texts'
    if (@include_search_inside &&
        type == 'texts' &&
        (first_hit = hits[0]) && 
        (identifier = first_hit["identifier"])
        )
      direct_url = URI.parse("http://www.archive.org/stream/" + identifier)

      # HEAD request; if we get a 200, we think it means we have a
      # page turner with search.
      req = Net::HTTP.new(direct_url.host, direct_url.port)
      response = req.request_head(direct_url.path)
      if response.code == "200"
        # search inside!
        request.add_service_response(
          :service => self,
          :display_text=> @display_name,
          :display_text_i18n => "display_name",
          :url => direct_url.to_s,
          :service_type_value => :search_inside
        )
      end        
    end



    # add a service response for each result for this mediatype
    hits.each_with_index do |result, index|
      break if index >= @num_results_for_types[type] 

      display_name = @display_name
      
      if result["contributor"] && result["contributor"].first
        display_name += ": " + result["contributor"].first
      elsif ( result["collection"] && COLLECTION_LABELS[result["collection"][0]])
        display_name += ": " + COLLECTION_LABELS[result["collection"][0]]
      end
      
      service_type = SERVICE_TYPE_MAP[type]
      request.add_service_response(
          :service=>self, 
          :display_text=>display_name, 
          :display_text_i18n => "display_name",
          :url=>create_result_url(result),
          :match_reliability => ServiceResponse::MatchUnsure,
          :edition_str => edition_str(result),
          :service_type_value => service_type )        
    end  
  end
end
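
For orientation, the body parsed by MultiJson.load above is a Solr-style JSON wrapper; the keys accessed in the code are shown below in a hypothetical, abridged example (field values invented):

# Hypothetical, abridged document as returned by MultiJson.load (values invented).
doc = {
  "response" => {
    "numFound" => 2,
    "start"    => 0,
    "docs"     => [
      { "identifier" => "mobydickorwhale01melv",
        "title"      => "Moby Dick; or, The Whale",
        "mediatype"  => "texts",
        "collection" => ["americana"],
        "creator"    => ["Melville, Herman"],
        "date"       => "1892-01-01T00:00:00Z" }
    ]
  }
}
doc['response']['docs'].first['identifier']  # => "mobydickorwhale01melv"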

#do_web_link(request, search_terms, type, num_found) ⇒ Object

Displaying num_found relies on the number of results requested by ia_params being enough to capture all results for a mediatype. If there are more potential results, num_found will not be accurate, but it is good enough.



# File 'app/service_adaptors/internet_archive.rb', line 198

def do_web_link(request, search_terms, type, num_found)
  display_text = "#{num_found} digital #{type.singularize} " + (num_found > 1 ? "files" : "file")

  
  url = create_web_link_url(search_terms, type)
  request.add_service_response(  
      :service=>self,    
      :url=>url,
      :display_text=>display_text, 
      :service_type_value => :highlighted_link   
   )
end
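
As a quick illustration (values hypothetical), the display text built above reads, for example:

num_found = 3
type = "texts"
"#{num_found} digital #{type.singularize} " + (num_found > 1 ? "files" : "file")
# => "3 digital text files"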

#edition_str(result) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 300

def edition_str(result)
  edition_str = ""
  
  edition_str << result['title'] unless result['title'].blank?

  edition_str << " / #{result['creator'].first}" unless result['creator'].blank?
  edition_str << ". #{result["publisher"].first}" unless result['publisher'].blank?
  unless result['date'].blank?
    year = result['date'].slice(0,4)
    edition_str << ": #{year}"
  end
  
  edition_str = nil if edition_str.blank?

  return edition_str
end
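
A small illustration of the edition string this produces; the result hash is hypothetical:

# Hypothetical IA result hash.
result = { "title"     => "Moby Dick",
           "creator"   => ["Melville, Herman"],
           "publisher" => ["Harper & Brothers"],
           "date"      => "1851-01-01T00:00:00Z" }
edition_str(result)
# => "Moby Dick / Melville, Herman. Harper & Brothers: 1851"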

#handle(request) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 75

def handle(request)
  begin
    do_query(request)
  rescue Timeout::Error => e
    return request.dispatched(self, false, e)
  end
  return request.dispatched(self, true)
end

#ia_params(search_terms) ⇒ Object

Here we create params in the format that the IA advanced search needs. These are Solr-like params.



# File 'app/service_adaptors/internet_archive.rb', line 184

def ia_params(search_terms)
  return nil if search_terms[:title].nil?
  params = 'fl%5B%5D=*&fmt=json&xmlsearch=Search' #&indent=yes
  params << "&rows=999&q=" #is 999 too many or even too few?
  params << create_query_params(search_terms)   
end
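
Continuing the hypothetical search terms used for create_query_params above, the full parameter string comes out roughly as:

search_terms = { :title => "the great gatsby", :creator => "fitzgerald" }
ia_params(search_terms)
# => "fl%5B%5D=*&fmt=json&xmlsearch=Search&rows=999&q=title:%22the+great+gatsby%22+AND+creator:%28fitzgerald%29+AND+(mediatype:texts+OR+mediatype:audio)"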

#matching_hits(request, search_terms, results, type) ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 265

def matching_hits(request, search_terms, results, type)    
  full_title = raw_search_title(request.referent)

  hits =  results.find_all do |r|      
    r["mediatype"] == type &&
    titles_sufficiently_matched(search_terms[:title], full_title, r["title"])   
  end

  return hits
end

#response_url(service_type, submitted_params) ⇒ Object

Catch and redirect response_url for search_inside.



# File 'app/service_adaptors/internet_archive.rb', line 318

def response_url(service_type, submitted_params)
  if ( ! (service_type.service_type_value.name == "search_inside" ))
    return super(service_type, submitted_params)
  else
    base = service_type.service_response[:url]
    query = CGI.escape(submitted_params["query"] || "")
    url = base + "#search/#{query}"
    return url
  end
end
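
A small sketch of the search-inside branch; the stream identifier and query are hypothetical:

base  = "http://www.archive.org/stream/mobydickorwhale01melv"
query = CGI.escape("white whale")
base + "#search/#{query}"
# => "http://www.archive.org/stream/mobydickorwhale01melv#search/white+whale"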

#safe_argument(string) ⇒ Object

Used on values that will be placed into a URL as search terms. Does NOT CGI-escape, but does make them safe in other ways for IA.



# File 'app/service_adaptors/internet_archive.rb', line 247

def safe_argument(string)
  # Downcase params to avoid weird misconfiguration in IA's SOLR
  # installation, where it's interpreting uppercase words as
  # commands even within quotes. 
  output = string.downcase
  
  # Remove parens, semi-colons, brackets, hyphens, colons -- they all mess
  # up IA, which thinks they are special chars. Also remove the double quote,
  # a special char which sometimes we want to use ourselves. Replace
  # all with spaces to avoid accidentally conjoining words.
  # (Could we escape instead? Not worth it, we don't want to search
  # on these anyway. Remove ALL punctuation? Not sure.)
  output.gsub!(/[)(\]\[;"\=\-\:]/, ' ')
  
  return output
end
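
For illustration (input hypothetical), the downcasing and punctuation replacement behave like this:

safe_argument('Hamlet: Prince of Denmark (Norton)')
# => "hamlet  prince of denmark  norton "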

#service_types_generated ⇒ Object



# File 'app/service_adaptors/internet_archive.rb', line 39

def service_types_generated
  types = [ 
    ServiceTypeValue[:fulltext], 
    ServiceTypeValue[:audio],
    ServiceTypeValue[:'highlighted_link']      
    ]
  types << ServiceTypeValue[:search_inside] if @include_search_inside
  return types
end

#titles_sufficiently_matched(query_title, full_title, result) ⇒ Object

Some obtuse code to heuristically decide whether our query title and a result title match closely enough to be considered the same work.



# File 'app/service_adaptors/internet_archive.rb', line 278

def titles_sufficiently_matched(query_title, full_title, result)    
  normalized_query      = normalize_title(query_title)
  normalized_full_title = normalize_title(full_title)
  # If the title has more than 3 words, and our IA query returned
  # a result for it -- that's probably good enough. 
  return true if normalized_query.split(" ").length > 3


  # Otherwise, make multiple versions of the candidate
  # title -- the whole thing, the title until the first colon,
  # and the title until the first comma or semi-colon or other punct. Normalize
  # them all. See if any of them match EITHER our search title or
  # our full title. 
  candidates = [
    result,
    result.split(":").first,
    result.split(/[\;\,\(\)]/).first
  ].compact.uniq.collect {|a| normalize_title(a)}
  
  return (candidates & [normalized_query, normalized_full_title]).present?
end
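
A brief sketch with hypothetical titles; because the query has three or fewer words, the candidate-comparison path is exercised. This assumes normalize_title (from MetadataHelper) is case-insensitive:

query_title = "moby dick"
full_title  = "Moby Dick; or, The Whale"
result      = "Moby Dick; or, The Whale"
titles_sufficiently_matched(query_title, full_title, result)
# => true -- the candidate truncated at the first ";" normalizes to the same
#    string as the query title, so the intersection is non-empty.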