Module: ParsePage
- Included in: GeneralScraper
- Defined in: lib/parse_page.rb
Instance Method Summary
- #fixEncode(str) ⇒ Object
- #getContent(url, pagehash, html) ⇒ Object
  Get the page content by type of page.
- #getHTMLText(url, pagehash, html) ⇒ Object
  Download the page text.
- #getMetadata(url, html) ⇒ Object
  Get the page metadata.
- #getPageData(url) ⇒ Object
  Get both page metadata and text.
- #getPDF(url, pagehash) ⇒ Object
  Download and extract text from PDF.
Instance Method Details
#fixEncode(str) ⇒ Object
# File 'lib/parse_page.rb', line 64

def fixEncode(str)
  if str.is_a?(String)
    return str.unpack('C*').pack('U*')
  else
    return str
  end
end
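For illustration, fixEncode reads each byte of the input and re-packs it as a UTF-8 codepoint, which maps Latin-1 bytes onto valid UTF-8; non-String values pass through untouched. A minimal sketch, assuming ParsePage has been loaded (the require path, demo class, and sample bytes are illustrative):

require_relative "lib/parse_page"

class EncodeDemo
  include ParsePage
end

demo = EncodeDemo.new
demo.fixEncode("caf\xE9".b)  # Latin-1 bytes re-encoded => "café"
demo.fixEncode(42)           # non-String input returned as-is => 42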
#getContent(url, pagehash, html) ⇒ Object
Get the page content by type of page
# File 'lib/parse_page.rb', line 17

def getContent(url, pagehash, html)
  if url.include? ".pdf"
    return getPDF(url, pagehash)
  else
    return getHTMLText(url, pagehash, html)
  end
end
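A brief sketch of the dispatch: any URL containing ".pdf" is routed to #getPDF, everything else to #getHTMLText. The calls assume an object that mixes in ParsePage (such as a GeneralScraper instance) plus the pagehash and html arguments that #getPageData would normally supply; all names and URLs below are placeholders.

# scraper: any object that includes ParsePage; pagehash/html as built by getPageData
scraper.getContent("http://example.com/report.pdf", pagehash, html)  # routed to getPDF
scraper.getContent("http://example.com/about.html", pagehash, html)  # routed to getHTMLText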
#getHTMLText(url, pagehash, html) ⇒ Object
Download the page text
# File 'lib/parse_page.rb', line 26

def getHTMLText(url, pagehash, html)
  pagehash[:text] = fixEncode(html.css("body").text)
  return pagehash
end
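A minimal sketch, assuming an object that mixes in ParsePage: only the visible <body> text is kept, and the url argument is carried along but not used here. The markup and names are illustrative.

require "nokogiri"

# scraper: any object that includes ParsePage
html = Nokogiri::HTML("<html><body><p>Hello world</p></body></html>")
pagehash = scraper.getHTMLText("http://example.com", {}, html)
pagehash[:text]  # => "Hello world"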
#getMetadata(url, html) ⇒ Object
Get the page metadata
# File 'lib/parse_page.rb', line 44

def getMetadata(url, html)
  pagehash = Hash.new

  # Save URL and date retrieved
  url.gsub!("%3F", "?")
  url.gsub!("%3D", "=")
  pagehash[:url] = url
  pagehash[:date_retrieved] = Time.now

  # Get title and meta tag info
  pagehash[:title] = fixEncode(html.css("title").text)
  html.css("meta").each do |m|
    if m
      pagehash[m['name']] = fixEncode(m['content'])
    end
  end

  return pagehash
end
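A sketch of what ends up in the hash, again assuming an object that mixes in ParsePage: %3F/%3D in the URL are unescaped back to ?/=, the <title> text is stored under :title, and each <meta> tag is stored under its name attribute. The markup and URL below are illustrative.

require "nokogiri"

# scraper: any object that includes ParsePage
doc = Nokogiri::HTML(<<~HTML)
  <html><head>
    <title>Example</title>
    <meta name="description" content="A sample page">
  </head></html>
HTML

pagehash = scraper.getMetadata("http://example.com/page%3Fid%3D1", doc)
pagehash[:url]           # => "http://example.com/page?id=1"
pagehash[:title]         # => "Example"
pagehash["description"]  # => "A sample page"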
#getPageData(url) ⇒ Object
Get both page metadata and text
# File 'lib/parse_page.rb', line 5

def getPageData(url)
  begin
    page = @requests.get_page(url)
    html = Nokogiri::HTML(page)
    pagehash = getMetadata(url, html)
    pagehash = getContent(url, pagehash, html)
    @output.push(pagehash)
  rescue
  end
end
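#getPageData depends on state provided by the including class: an @requests object whose get_page(url) returns raw HTML, and an @output array that collects the finished hashes (GeneralScraper is assumed to supply both). Note that the bare rescue silently discards any page that fails. A self-contained sketch with a stubbed requester; all names here are illustrative.

require "nokogiri"
require_relative "lib/parse_page"

# Stand-in for the real request layer: anything responding to get_page(url).
class FakeRequests
  def get_page(_url)
    "<html><head><title>Hi</title></head><body>Text</body></html>"
  end
end

class DemoScraper
  include ParsePage
  attr_reader :output

  def initialize
    @requests = FakeRequests.new
    @output   = []
  end
end

scraper = DemoScraper.new
scraper.getPageData("http://example.com")
scraper.output.first[:title]  # => "Hi"
scraper.output.first[:text]   # => "Text"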
#getPDF(url, pagehash) ⇒ Object
Download and extract text from PDF
# File 'lib/parse_page.rb', line 32

def getPDF(url, pagehash)
  `wget -P public/uploads #{url}`
  path = url.split("/")

  # OCR PDF and save fields
  u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
  pdfparse = JSON.parse(u.handleDoc)
  pdfparse.each{|k, v| pagehash[k] = fixEncode(v)}
  return pagehash
end
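A hypothetical call, shown only as a sketch: it needs wget on the PATH, an existing public/uploads directory, the UploadConvert dependency, and an object that mixes in ParsePage. The URL is a placeholder, and actually running it downloads the file.

# scraper: any object that includes ParsePage
pdf_url  = "http://example.com/docs/report.pdf"
pagehash = scraper.getPDF(pdf_url, { url: pdf_url })
# pagehash now also contains whatever fields UploadConvert#handleDoc returned
# (parsed from JSON), each value run through fixEncode.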