Class: ArticleJSON::Import::GoogleDoc::HTML::NodeAnalyzer

Inherits:
Object
  • Object
show all
Defined in:
lib/article_json/import/google_doc/html/node_analyzer.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(node) ⇒ NodeAnalyzer

Returns a new instance of NodeAnalyzer.

Parameters:

  • node (Nokogiri::XML::Node) — the HTML node to analyze


9
10
11
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 9

# Store the node that every predicate of this analyzer inspects.
# @param node [Object] the node to analyze; the predicates call `name`,
#   `inner_text` and `xpath` on it (a Nokogiri node per the class docs)
def initialize(node)
  @node = node
end

Instance Attribute Details

#node ⇒ Object (readonly)

Returns the value of attribute node.



6
7
8
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 6

# Read-only access to the node handed to the constructor.
# @return [Object] the value stored in @node
def node
  @node
end

Instance Method Details

#begins_with?(text) ⇒ Boolean

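Check if the node text begins with a certain text, i.e. whether its first whitespace-delimited word equals that text (ignoring case and surrounding whitespace)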

Parameters:

  • text (String)

Returns:

  • (Boolean)


23
24
25
26
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 23

# Compare the first whitespace-delimited word of the node's text with the
# given text. Both sides are stripped and downcased before comparing.
# @param text [String] the word the node text is expected to start with
# @return [Boolean] false when the node text has no words at all
def begins_with?(text)
  words = node.inner_text.strip.downcase.split(' ')
  # `first` is nil for blank node text, which never equals a String.
  words.first == text.strip.downcase
end

#br? ⇒ Boolean

Check if the node is a linebreak. A span only containing whitespaces and
`<br>` tags is considered a linebreak.

Returns:

  • (Boolean)


114
115
116
117
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 114

# Tell whether the node is a line break: either a `br` element itself or a
# node for which `only_includes_brs?` holds. The answer is computed once
# and cached in @is_br (a cached `false` is reused as well).
# @return [Boolean]
def br?
  unless defined?(@is_br)
    @is_br = node.name == 'br' || only_includes_brs?
  end
  @is_br
end

#embed? ⇒ Boolean

Check if the node contains an embedded element

Returns:

  • (Boolean)


106
107
108
109
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 106

# Tell whether the node is an embedded element, as decided by
# `EmbeddedParser.supported?`. The result is cached in @is_embed.
# @return [Boolean]
def embed?
  if defined?(@is_embed)
    @is_embed
  else
    @is_embed = EmbeddedParser.supported?(node)
  end
end

#empty? ⇒ Boolean

Check if the node is empty, i.e. not containing any text. Since images, horizontal rules and line breaks carry no text either, we have to make sure that the node is none of those.

Returns:

  • (Boolean)


32
33
34
35
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 32

# Tell whether the node carries no content: its stripped text is blank and
# it is neither an image, a horizontal rule nor a line break. The result is
# cached in @is_empty.
# @return [Boolean]
def empty?
  return @is_empty if defined?(@is_empty)

  blank_text = node.inner_text.strip.empty?
  # The other predicates are only consulted when there is no text at all.
  @is_empty = blank_text && !(image? || hr? || br?)
end

#has_text?(text) ⇒ Boolean

Check if a node equals a certain text

Parameters:

  • text (String)

Returns:

  • (Boolean)


16
17
18
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 16

# Compare the node's whole text with the given text, ignoring case and
# leading/trailing whitespace on both sides.
# @param text [String] the text the node is expected to consist of
# @return [Boolean]
def has_text?(text)
  node_text = node.inner_text.strip.downcase
  node_text == text.strip.downcase
end

#heading? ⇒ Boolean

Check if the node is a header tag between <h1> and <h5>

Returns:

  • (Boolean)


39
40
41
42
43
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 39

# Tell whether the node is an `h1`..`h5` element that does not act as a
# quote or text box marker. The result is cached in @is_heading.
# @return [Boolean]
def heading?
  unless defined?(@is_heading)
    # Marker checks come first; the tag name is only read when both fail.
    special_marker = quote? || text_box?
    @is_heading = !special_marker && %w(h1 h2 h3 h4 h5).include?(node.name)
  end
  @is_heading
end

#hr? ⇒ Boolean

Check if the node is a horizontal line (i.e. `<hr>`)

Returns:

  • (Boolean)


47
48
49
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 47

# Tell whether the node is a horizontal rule element.
# @return [Boolean] true when the node's tag name is exactly 'hr'
def hr?
  tag_name = node.name
  tag_name == 'hr'
end

#image? ⇒ Boolean

Check if the node contains an image

Returns:

  • (Boolean)


89
90
91
92
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 89

# Tell whether the node represents an image: its text holds an image URL
# (see `image_url?`) or it has at least one `img` descendant. The result is
# cached in @is_image.
# @return [Boolean]
def image?
  unless defined?(@is_image)
    # The XPath lookup only runs when no image URL was found in the text.
    @is_image = image_url? || !node.xpath('.//img').length.zero?
  end
  @is_image
end

#image_url? ⇒ Boolean

Check if the node contains an image URL

Returns:

  • (Boolean)


96
97
98
99
100
101
102
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 96

# Tell whether the node's text contains an http(s) URL pointing at a
# jpg, jpeg, png or gif file (extension matched case-insensitively, anywhere
# in the text). The result is cached in @is_image_url.
# @return [Boolean]
def image_url?
  unless defined?(@is_image_url)
    image_url_pattern = %r{https?:\/\/\S+\.(?:jpg|jpeg|png|gif)}i
    match_offset = image_url_pattern =~ node.inner_text.strip
    # `=~` yields the match offset or nil; collapse it to a strict boolean.
    @is_image_url = !match_offset.nil?
  end
  @is_image_url
end

#list? ⇒ Boolean

Check if the node contains an ordered or unordered list

Returns:

  • (Boolean)


66
67
68
69
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 66

# Tell whether the node is a `ul` or `ol` element. The result is cached in
# @is_list.
# @return [Boolean]
def list?
  @is_list = %w(ul ol).include?(node.name) unless defined?(@is_list)
  @is_list
end

#paragraph? ⇒ Boolean

Check if the node is a normal text paragraph

Returns:

  • (Boolean)


53
54
55
56
57
58
59
60
61
62
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 53

# Tell whether the node is an ordinary text paragraph: a `p` element that
# is not empty and is not an image, text box marker, quote marker or embed.
# The result is cached in @is_paragraph.
# @return [Boolean]
def paragraph?
  unless defined?(@is_paragraph)
    # The exclusions are checked in order and stop at the first one that
    # applies; none of them runs for non-`p` nodes.
    @is_paragraph =
      node.name == 'p' &&
      !(empty? || image? || text_box? || quote? || embed?)
  end
  @is_paragraph
end

#quote? ⇒ Boolean

Check if the node starts a quote. Quotes start with a single line saying “Quote:”.

Returns:

  • (Boolean)


82
83
84
85
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 82

# Tell whether the node is a quote marker, i.e. its text is exactly
# 'quote:' as judged by `has_text?`. The result is cached in @is_quote.
# @return [Boolean]
def quote?
  if defined?(@is_quote)
    @is_quote
  else
    @is_quote = has_text?('quote:')
  end
end

#text_box? ⇒ Boolean

Check if the node starts a text box. Text boxes start with a single line saying “Textbox:” or “Highlight:”.

Returns:

  • (Boolean)


74
75
76
77
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 74

# Tell whether the node is a text box marker, i.e. `begins_with?` holds for
# 'textbox:' or 'highlight:' (checked in that order). The result is cached
# in @is_text_box.
# @return [Boolean]
def text_box?
  return @is_text_box if defined?(@is_text_box)

  @is_text_box =
    %w(textbox: highlight:).any? { |marker| begins_with?(marker) }
end

#type ⇒ Symbol

Determine the type of this node. The type is one of the elements supported by article_json.

Returns:

  • (Symbol)


122
123
124
125
126
127
128
129
130
131
132
133
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 122

# Classify the node by running the predicates in a fixed order and
# reporting the first one that applies.
# @return [Symbol] one of :empty, :hr, :heading, :paragraph, :list,
#   :text_box, :quote, :image, :embed, or :unknown when none applies
def type
  # Order matters: earlier branches take precedence over later ones.
  if empty? then :empty
  elsif hr? then :hr
  elsif heading? then :heading
  elsif paragraph? then :paragraph
  elsif list? then :list
  elsif text_box? then :text_box
  elsif quote? then :quote
  elsif image? then :image
  elsif embed? then :embed
  else :unknown
  end
end