Module: ParsePage

Included in:
GeneralScraper
Defined in:
lib/parse_page.rb

Instance Method Summary

Instance Method Details

#fixEncode(str) ⇒ Object



# File 'lib/parse_page.rb', line 64

def fixEncode(str)
  if str.is_a?(String)
    return str.unpack('C*').pack('U*')
  else
    return str
  end
end
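
The unpack('C*').pack('U*') round trip reads the string's raw bytes and re-emits each byte value as a UTF-8 codepoint, which effectively repairs Latin-1 (or raw single-byte) text. A minimal illustration of the same transformation, using a hypothetical sample string:

# "caf\xE9" holds the Latin-1 bytes for "café" and is invalid as UTF-8
mangled = "caf\xE9"
fixed   = mangled.unpack('C*').pack('U*')   # bytes [99, 97, 102, 233] -> codepoints
puts fixed            # => "café"
puts fixed.encoding   # => UTF-8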

#getContent(url, pagehash, html) ⇒ Object

Get the page content according to the page type (PDF or HTML)



# File 'lib/parse_page.rb', line 17

def getContent(url, pagehash, html)
  if url.include? ".pdf"
    return getPDF(url, pagehash)
  else
    return getHTMLText(url, pagehash, html)
  end
end
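
The dispatch is purely string-based: any URL containing ".pdf" is routed to getPDF, everything else to getHTMLText. A sketch of the HTML path, assuming lib/parse_page.rb loads on its own and using a hypothetical wrapper class and document:

require 'nokogiri'
require_relative 'lib/parse_page'

class ContentDemo
  include ParsePage
end

doc = Nokogiri::HTML("<html><head><title>Demo</title></head><body>Hello</body></html>")

# No ".pdf" in the URL, so this goes through getHTMLText
p ContentDemo.new.getContent("http://example.com/page.html", {}, doc)
# => {:text=>"Hello"}

# A URL containing ".pdf" would be routed to getPDF instead, which shells out
# to wget and OCRs the file, so it is not exercised here.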

#getHTMLText(url, pagehash, html) ⇒ Object

Extract the page text from the HTML body



# File 'lib/parse_page.rb', line 26

def getHTMLText(url, pagehash, html)
  pagehash[:text] = fixEncode(html.css("body").text)
  return pagehash
end
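
Nokogiri's css("body").text concatenates every text node under <body>; the result is stored in pagehash[:text] after passing through fixEncode. With hypothetical markup:

require 'nokogiri'

doc = Nokogiri::HTML("<html><body><h1>Title</h1><p>First paragraph.</p></body></html>")
puts doc.css("body").text
# => "TitleFirst paragraph."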

#getMetadata(url, html) ⇒ Object

Get the page metadata



# File 'lib/parse_page.rb', line 44

def getMetadata(url, html)
  pagehash = Hash.new

  # Save URL and date retrieved
  url.gsub!("%3F", "?")
  url.gsub!("%3D", "=")
  pagehash[:url] = url
  pagehash[:date_retrieved] = Time.now

  # Get title and meta tag info
  pagehash[:title] = fixEncode(html.css("title").text)
  html.css("meta").each do |m|
    if m
      pagehash[m['name']] = fixEncode(m['content'])
    end
  end

  return pagehash
end
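
Each <meta> tag's name attribute becomes a hash key and its content attribute the value, alongside :url, :date_retrieved, and :title. A sketch with hypothetical markup and URL, assuming lib/parse_page.rb loads on its own:

require 'nokogiri'
require_relative 'lib/parse_page'

class MetaDemo
  include ParsePage
end

doc = Nokogiri::HTML(<<~HTML)
  <html>
    <head>
      <title>Example Page</title>
      <meta name="description" content="A sample page">
      <meta name="author" content="Jane Doe">
    </head>
    <body></body>
  </html>
HTML

p MetaDemo.new.getMetadata("http://example.com/page%3Fid%3D1", doc)
# => {:url=>"http://example.com/page?id=1", :date_retrieved=>(current Time),
#     :title=>"Example Page", "description"=>"A sample page", "author"=>"Jane Doe"}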

#getPageData(url) ⇒ Object

Get both page metadata and text



# File 'lib/parse_page.rb', line 5

def getPageData(url)
  begin
    page = @requests.get_page(url)
    html = Nokogiri::HTML(page)
    pagehash = getMetadata(url, html)
    pagehash = getContent(url, pagehash, html)
    @output.push(pagehash)
  rescue
    # Skip pages that fail to download or parse
  end
end
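
The method depends on two instance variables that the including class (here GeneralScraper) is expected to provide: @requests, which must respond to get_page, and @output, a collector for the finished page hashes. The bare rescue means pages that fail to download or parse are silently skipped. A sketch with hypothetical stand-ins for both collaborators:

require 'nokogiri'
require 'open-uri'
require_relative 'lib/parse_page'

# Hypothetical requester standing in for whatever GeneralScraper uses
class SimpleRequests
  def get_page(url)
    URI.open(url).read
  end
end

class PageDataDemo
  include ParsePage
  attr_reader :output

  def initialize
    @requests = SimpleRequests.new
    @output   = []
  end
end

demo = PageDataDemo.new
demo.getPageData("http://example.com")
p demo.output.first   # hash with :url, :date_retrieved, :title, meta fields, and :text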

#getPDF(url, pagehash) ⇒ Object

Download and extract text from PDF



# File 'lib/parse_page.rb', line 32

def getPDF(url, pagehash)
  # Download the PDF into public/uploads
  `wget -P public/uploads #{url}`
  path = url.split("/")

  # OCR PDF and save fields
  u = UploadConvert.new("public/uploads/" + path[path.length-1].chomp.strip)
  pdfparse = JSON.parse(u.handleDoc)
  pdfparse.each{|k, v| pagehash[k] = fixEncode(v)}
  return pagehash
end
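
The file name handed to UploadConvert is recovered by splitting the URL on "/" and taking the last segment, mirroring where wget writes the download; UploadConvert itself and the JSON returned by handleDoc come from an external dependency and are not reproduced here. That path-building step in isolation, with a hypothetical URL:

url  = "http://example.com/docs/report.pdf"
path = url.split("/")
puts "public/uploads/" + path[path.length - 1].chomp.strip
# => "public/uploads/report.pdf"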