Module: Solrizer::HTML::Extractor

Defined in: lib/solrizer/html/extractor.rb

Instance Method Summary collapse

#html_to_solr(ds, solr_doc = Hash.new) ⇒ Object

Strips the HTML tags out of the datastream content and returns the remaining text to be indexed in Solr.

Instance Method Details

#html_to_solr(ds, solr_doc = Hash.new) ⇒ `Object`

Strips the HTML tags out of the datastream content and returns the remaining text to be indexed in Solr.

# File 'lib/solrizer/html/extractor.rb', line 11

# Extracts indexable fields from a datastream that holds HTML-escaped markup.
#
# The content is HTML-unescaped, parsed with Nokogiri, and two fields are
# written into +solr_doc+:
#
# * +:story_display+ - the inner XML of a <story> element. Only set when the
#   document contains one; if there are several, the last one wins.
# * +:story_t+ - the concatenated content of every text node in the document.
#
# @param ds [#content] datastream whose +content+ is the escaped HTML;
#   a nil content is treated as empty markup
# @param solr_doc [Hash] document to add the fields to (mutated in place)
# @return [Hash] the same +solr_doc+ that was passed in
def html_to_solr(ds, solr_doc = Hash.new)
  # A datastream with no content is indexed as empty text instead of letting
  # CGI.unescapeHTML(nil) raise and abort the whole indexing run.
  markup = CGI.unescapeHTML(ds.content || '')
  doc = Nokogiri::HTML(markup)

  # Keep the markup inside <story> for display purposes.
  doc.xpath('//story').each do |story|
    solr_doc[:story_display] = story.children.to_xml
  end

  # Tag-free text for full-text search.
  solr_doc[:story_t] = doc.xpath('//text()').map(&:content).join

  solr_doc
end

Module: Solrizer::HTML::Extractor

Instance Method Summary collapse

Instance Method Details

#html_to_solr(ds, solr_doc = Hash.new) ⇒ Object

#html_to_solr(ds, solr_doc = Hash.new) ⇒ `Object`