Class: ArticleJSON::Import::GoogleDoc::HTML::NodeAnalyzer

Inherits:
Object
  • Object
show all
Defined in:
lib/article_json/import/google_doc/html/node_analyzer.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(node) ⇒ NodeAnalyzer

Returns a new instance of NodeAnalyzer.

Parameters:



9
10
11
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 9

# Build an analyzer around a single node.
#
# @param node [Object] the node to analyze; it is stored unchanged in @node.
#   NOTE(review): the predicates of this class call #name, #inner_text and
#   #xpath on it, so this is presumably a Nokogiri node - confirm.
def initialize(node)
  @node = node
end

Instance Attribute Details

#node ⇒ Object (readonly)

Returns the value of attribute node.



6
7
8
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 6

# Reader for the node this analyzer was constructed with.
#
# @return [Object] the current value of @node
def node
  @node
end

Instance Method Details

#begins_with?(text) ⇒ Boolean

Check if the node text begins with a certain text

Parameters:

Returns:



23
24
25
26
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 23

# Check whether the first whitespace-separated word of the node's text
# equals the given text. Both sides are stripped and compared
# case-insensitively (via downcase).
#
# @param text [String] the word to look for at the start of the node text
# @return [Boolean] false when the node text is blank or the first word
#   differs from `text`
def begins_with?(text)
  # Destructuring yields nil for a blank text, which never equals a String.
  leading_word, = node.inner_text.strip.downcase.split(' ')
  wanted = text.strip.downcase
  leading_word == wanted
end

#br? ⇒ Boolean

Check if the node is a linebreak. A span only containing whitespaces and
`<br>` tags is considered a linebreak.

Returns:



122
123
124
125
126
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 122

# Check whether the node is a linebreak: either its tag name is 'br' or
# `only_includes_brs?` reports true. The answer is computed once and cached
# in @is_br.
#
# @return [Boolean]
def br?
  # `defined?` is used so that a cached `false` is not recomputed.
  @is_br = (node.name == 'br' || only_includes_brs?) unless defined? @is_br
  @is_br
end

#embed? ⇒ Boolean

Check if the node contains an embedded element

Returns:



113
114
115
116
117
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 113

# Check whether the node contains an embedded element, as decided by
# `EmbeddedParser.supported?`. The answer is cached in @is_embed.
#
# @return [Object] whatever `EmbeddedParser.supported?(node)` returned
#   (documented as a Boolean)
def embed?
  if defined?(@is_embed)
    @is_embed
  else
    @is_embed = EmbeddedParser.supported?(node)
  end
end

#empty? ⇒ Boolean

Check if the node is empty, i.e. not containing any text. Images, horizontal lines and linebreaks do not contain text either, so we have to make sure that the node is none of these.

Returns:



32
33
34
35
36
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 32

# Check whether the node is empty, i.e. its stripped text is blank and it
# is neither an image, a horizontal line nor a linebreak (those carry no
# text but are not considered empty). The answer is cached in @is_empty.
#
# @return [Boolean]
def empty?
  return @is_empty if defined? @is_empty

  without_text = node.inner_text.strip.empty?
  # The element checks only run when there is no text at all.
  @is_empty = without_text && !(image? || hr? || br?)
end

#has_text?(text) ⇒ Boolean

Check if a node equals a certain text

Parameters:

Returns:



16
17
18
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 16

# Check whether the node's whole text equals the given text. Both sides are
# stripped and compared case-insensitively (via downcase).
#
# @param text [String] the text the node is expected to consist of
# @return [Boolean]
def has_text?(text)
  actual = node.inner_text.strip.downcase
  expected = text.strip.downcase
  actual == expected
end

#heading? ⇒ Boolean

Check if the node is a header tag between <h1> and <h5>

Returns:



40
41
42
43
44
45
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 40

# Check whether the node is a heading tag between <h1> and <h5>. Nodes that
# start a quote or a text box are never reported as headings. The answer is
# cached in @is_heading.
#
# @return [Boolean]
def heading?
  return @is_heading if defined? @is_heading

  @is_heading =
    if quote? || text_box?
      false
    else
      %w(h1 h2 h3 h4 h5).include?(node.name)
    end
end

#hr? ⇒ Boolean

Check if the node is a horizontal line (i.e. `<hr>`)

Returns:



49
50
51
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 49

# Check whether the node is a horizontal line, i.e. its tag name is 'hr'.
#
# @return [Boolean]
def hr?
  tag_name = node.name
  tag_name == 'hr'
end

#image? ⇒ Boolean

Check if the node contains an image

Returns:



95
96
97
98
99
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 95

# Check whether the node contains an image: either its text holds an image
# URL (see `image_url?`) or at least one <img> element is found below it.
# The answer is cached in @is_image.
#
# @return [Boolean]
def image?
  unless defined? @is_image
    # The XPath lookup only runs when no image URL was found in the text.
    @is_image = image_url? || node.xpath('.//img').length.positive?
  end
  @is_image
end

#image_url? ⇒ Boolean

Check if the node contains an image URL

Returns:



103
104
105
106
107
108
109
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 103

# Check whether the node's text contains an http(s) URL followed by a
# .jpg, .jpeg, .png or .gif extension (case-insensitive). The pattern is
# not anchored, so the URL may appear anywhere in the text. The answer is
# cached in @is_image_url.
#
# @return [Boolean]
def image_url?
  return @is_image_url if defined? @is_image_url

  pattern = %r{https?:\/\/\S+\.(?:jpg|jpeg|png|gif)}i
  # Normalise the MatchData / nil result to a strict boolean.
  @is_image_url = pattern.match(node.inner_text.strip) ? true : false
end

#list? ⇒ Boolean

Check if the node contains an ordered or unordered list

Returns:



69
70
71
72
73
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 69

# Check whether the node is an unordered (<ul>) or ordered (<ol>) list,
# judged by its tag name. The answer is cached in @is_list.
#
# @return [Boolean]
def list?
  return @is_list if defined? @is_list

  @is_list =
    case node.name
    when 'ul', 'ol' then true
    else false
    end
end

#paragraph? ⇒ Boolean

Check if the node is a normal text paragraph

Returns:



55
56
57
58
59
60
61
62
63
64
65
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 55

# Check whether the node is a normal text paragraph: a <p> tag that is not
# empty and is not an image, text box start, quote start or embed. The
# answer is cached in @is_paragraph.
#
# @return [Boolean]
def paragraph?
  return @is_paragraph if defined? @is_paragraph

  # The exclusions are evaluated left to right and stop at the first hit.
  special = -> { empty? || image? || text_box? || quote? || embed? }
  @is_paragraph = node.name == 'p' && !special.call
end

#quote? ⇒ Boolean

Check if the node starts a quote. Quotes start with a single line saying “Quote:”.

Returns:



87
88
89
90
91
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 87

# Check whether the node starts a quote, i.e. `has_text?('quote:')` holds
# for it. The answer is cached in @is_quote.
#
# @return [Boolean]
def quote?
  @is_quote = has_text?('quote:') unless defined? @is_quote
  @is_quote
end

#text_box? ⇒ Boolean

Check if the node starts a text box. Text boxes start with a single line saying “Textbox:” or “Highlight:”.

Returns:



78
79
80
81
82
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 78

# Check whether the node starts a text box, i.e. its text begins with
# 'textbox:' or 'highlight:' according to `begins_with?`. The answer is
# cached in @is_text_box.
#
# @return [Boolean]
def text_box?
  return @is_text_box if defined? @is_text_box

  # Prefixes are tried in this order; the first match ends the search.
  @is_text_box = %w[textbox: highlight:].any? { |prefix| begins_with?(prefix) }
end

#type ⇒ Symbol

Determine the type of this node. The type is one of the elements supported by article_json.

Returns:



131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/article_json/import/google_doc/html/node_analyzer.rb', line 131

# Determine the type of this node by asking the predicates in a fixed
# order and reporting the first one that answers truthy.
#
# @return [Symbol] one of :empty, :hr, :heading, :paragraph, :list,
#   :text_box, :quote, :image, :embed - or :unknown when none applies
def type
  # Order matters: earlier entries take precedence over later ones.
  ordered_kinds = %i[empty hr heading paragraph list text_box quote image embed]
  detected = ordered_kinds.find { |kind| send(:"#{kind}?") }
  detected || :unknown
end