Class: ChupaText::Decomposers::WebKit

Inherits:
Decomposer
  • Object
show all
Includes:
LogTag, Loggable
Defined in:
lib/chupa-text/decomposers/webkit.rb

Defined Under Namespace

Modules: LogTag Classes: ExternalScreenshoter

Constant Summary collapse

# File extensions that #target? accepts without inspecting the body.
# Frozen so the matching rules cannot be mutated at runtime.
TARGET_EXTENSIONS = ["htm", "html", "xhtml"].freeze

# MIME types that #target? accepts without inspecting the body.
TARGET_MIME_TYPES = [
  "text/html",
  "application/xhtml+xml",
].freeze

# Attribute key that #decompose sets on the data to record whether a
# screenshot was produced. #target? refuses data on which this attribute is
# already set, so the same data is not processed twice.
AVAILABLE_ATTRIBUTE_NAME = "decomposer-webkit-screenshot-available".freeze

# True only when the CHUPA_TEXT_DECOMPOSER_WEBKIT_IN_PROCESS environment
# variable is exactly "yes" (evaluated once, when this file is loaded).
# #decompose uses it to choose between the in-process screenshoter and
# ExternalScreenshoter.
IN_PROCESS = ENV["CHUPA_TEXT_DECOMPOSER_WEBKIT_IN_PROCESS"] == "yes"

Instance Method Summary collapse

Instance Method Details

#decompose(data) {|data| ... } ⇒ Object

Yields:

  • (data)


66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# File 'lib/chupa-text/decomposers/webkit.rb', line 66

# Renders the source of +data+ to a PNG screenshot and attaches it to +data+.
#
# The page is rendered into a temporary ".png" file, by
# ChupaTextDecomposerWebKit::Screenshoter when IN_PROCESS is true and by
# ExternalScreenshoter otherwise. If the resulting file is non-empty, its
# bytes are Base64-encoded and stored as data.screenshot. Whether or not a
# screenshot was produced is recorded under AVAILABLE_ATTRIBUTE_NAME.
#
# The temporary file is always closed and deleted before this method yields
# or raises, so no file descriptor or temp file outlives the call.
#
# @param data the data whose source should be rendered; it must provide
#   +source+, +expected_screenshot_size+, +screenshot+, +screenshot=+ and
#   +[]=+
# @yield [data] the same +data+ object, after the availability attribute
#   has been set
def decompose(data)
  uri = data.source.uri.to_s
  width, height = data.expected_screenshot_size
  output = Tempfile.new(["chupa-text-decomposer-webkit", ".png"])
  begin
    # Only the path is handed to the screenshoter, so release our own handle
    # right away; the file itself stays on disk until close! below.
    output.close
    if IN_PROCESS
      screenshoter = ChupaTextDecomposerWebKit::Screenshoter.new(logger)
      screenshoter.run(data.source.body, uri, output.path, width, height)
    else
      screenshoter = ExternalScreenshoter.new
      screenshoter.run(data.source.path, uri, output.path, width, height)
    end
    unless File.size(output.path).zero?
      # PNG is binary data: read it by path in binary mode rather than
      # through a text-mode handle opened before the file was written.
      png = File.binread(output.path)
      data.screenshot = Screenshot.new("image/png",
                                       [png].pack("m*"),
                                       "base64")
    end
  ensure
    # Close and unlink the temporary file even if the screenshoter raised.
    output.close!
  end
  data[AVAILABLE_ATTRIBUTE_NAME] = !data.screenshot.nil?
  yield(data)
end

#target?(data) ⇒ Boolean

Returns:

  • (Boolean)


41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# File 'lib/chupa-text/decomposers/webkit.rb', line 41

# Decides whether this decomposer should take a screenshot of +data+.
#
# Data is rejected when it does not need a screenshot, already has one, or
# already carries the AVAILABLE_ATTRIBUTE_NAME attribute (i.e. #decompose
# has handled it before). Otherwise it is accepted when the source's
# extension is in TARGET_EXTENSIONS, its MIME type is in TARGET_MIME_TYPES,
# or its body begins like an HTML document.
#
# @param data the candidate data; it must provide +need_screenshot?+,
#   +screenshot+, +[]+ and +source+
# @return [Boolean] true if the data looks like HTML that still needs a
#   screenshot
def target?(data)
  return false unless data.need_screenshot?
  return false if data.screenshot
  return false unless data[AVAILABLE_ATTRIBUTE_NAME].nil?

  source = data.source
  return false if source.nil?

  return true if TARGET_EXTENSIONS.include?(source.extension)
  return true if TARGET_MIME_TYPES.include?(source.mime_type)

  source_body = source.body
  return false if source_body.nil?

  # Sniff only a bounded prefix, as a binary copy, so that large bodies are
  # not duplicated and invalid byte sequences cannot make the match raise.
  head = source_body.byteslice(0, 1024).b

  # Accept an "<html" start tag or an HTML doctype, ignoring case and
  # leading whitespace. This covers the HTML5 form "<!DOCTYPE html>" as well
  # as the legacy "<!DOCTYPE html PUBLIC ...>" form.
  !(head =~ /\A\s*(?:<!DOCTYPE\s+html[\s>]|<html)/i).nil?
end