Module: ZhSieve::Converter

Included in:
HTMLPage
Defined in:
lib/zhSieve/html2md.rb

Instance Method Summary collapse

Dynamic Method Handling

This class handles dynamic methods through the method_missing method

#method_missing(name, *args, &block) ⇒ Object

define custom node processor



120
121
122
123
124
# File 'lib/zhSieve/html2md.rb', line 120

def method_missing(name,*args,&block)
  self.class.send :define_method,"parse_#{name}" do |node,contents|
    block.call node,contents
  end
end

Instance Method Details

#answer_to_markdown(string_contents, question_id, answer_id) ⇒ Object

Raises:



22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# File 'lib/zhSieve/html2md.rb', line 22

def answer_to_markdown string_contents,question_id,answer_id
  raise NoContents unless string_contents!=nil
  doc = Nokogiri::HTML(string_contents,'UTF-8')
  answer_node = doc.search '[data-aid="'+answer_id.to_s+'"]'
  search_answer = Answer.new
  # search and set question infomations
  avatar_raw = answer_node.search '[class="zm-list-avatar avatar"]'
  author_raw = answer_node.search '[class="author-link"]'
  bio_raw = answer_node.search '[class="bio"]'
  content_raw = answer_node.search '[class="zm-editable-content clearfix"]'
  search_answer.question = doc.title.strip
  search_answer.link = "https://www.zhihu.com/people/#{question_id}#answer-#{answer_id}"
  search_answer.avatar= parse_element(avatar_raw.first)
  search_answer.bio = parse_element(bio_raw.first)
  search_answer.content = parse_element(content_raw.first)
  author_info = parse_element(author_raw.first).split(/[\[\]\(\)]/)
  search_answer.author = author_info[1]
  search_answer.author_link = "https://www.zhihu.com" + author_info[3]
  markdown_text = search_answer.format_markdown
end

#article_to_markdown(string_contents, article_id) ⇒ Object

Raises:



43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/zhSieve/html2md.rb', line 43

def article_to_markdown string_contents,article_id
  raise NoContents unless string_contents!=nil
  doc_hash = JSON.parse(string_contents)
  search_article = Article.new
  search_article.title = doc_hash["title"]
  search_article.title_image = "![](#{doc_hash["titleImage"]})"
  search_article.published_time = doc_hash['publishedTime']
  search_article.link = "https://zhuanlan.zhihu.com/p/#{article_id}"
  search_article.content = doc_hash["content"]
  search_article.author = doc_hash["author"]["name"]
  search_article.bio = doc_hash["author"]["bio"] || "No Bio!"
  search_article.author_link = doc_hash["author"]["profileUrl"]
  avatar_template = doc_hash["author"]["avatar"]["template"].gsub('{size}','s')
  avatar_id = doc_hash["author"]["avatar"]["id"]
  avatar_uri = avatar_template.gsub('{id}', "#{avatar_id}")
  search_article.avatar = "![](#{avatar_uri})"
  markdown_text = search_article.format_markdown
end

#debugObject



126
127
128
129
130
# File 'lib/zhSieve/html2md.rb', line 126

def debug
  puts '----------------------------------'
  puts yield
  puts '----------------------------------'
end

#parse_element(ele) ⇒ Object



62
63
64
65
66
67
68
69
70
71
72
# File 'lib/zhSieve/html2md.rb', line 62

def parse_element(ele)
  if ele.is_a? Nokogiri::XML::Text
    return "#{ele.text}\n"
  else
    if (children = ele.children).count > 0 
      return wrap_node(ele,children.map {|ele| parse_element(ele)}.join )
    else
      return wrap_node(ele,ele.text)
    end
  end
end

#people_to_markdown(string_contents) ⇒ Object

Raises:



15
16
17
18
19
20
# File 'lib/zhSieve/html2md.rb', line 15

def people_to_markdown string_contents
  raise NoContents unless string_contents!=nil 
  doc = Nokogiri::HTML(string_contents,'UTF-8')
  search_user = People.new
  # set people info
end

#to_markdown(string_contents) ⇒ Object

Raises:



9
10
11
12
13
# File 'lib/zhSieve/html2md.rb', line 9

def to_markdown string_contents
  raise NoContents unless string_contents!=nil 
  doc = Nokogiri::HTML(string_contents,'UTF-8')
  doc.children.map { |ele| parse_element(ele) }.join
end

#wrap_node(node, contents = nil) ⇒ Object

wrap node with markdown



75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# File 'lib/zhSieve/html2md.rb', line 75

def wrap_node(node,contents=nil)
  result = ''
  contents.strip! unless contents==nil
  # check if there is a custom parse exist
  if respond_to? "parse_#{node.name}"
    return self.send("parse_#{node.name}",node,contents)
  end
  # skip hidden node
  return '' if node['style'] and node['style'] =~ /display:\s*none/
  # default parse
  case node.name.downcase
  when 'i'
  when 'script'
  when 'style'
  when 'li'
    result << "*#{contents}\n"
  when 'blockquote'
    contents.split('\n').each do |part|
      result << ">#{contents}\n"
    end
  when 'p'
    result << "\n#{contents}\n"
  when 'strong'
    result << "**#{contents}**\n"
  when 'h1'
    result << "# #{contents}\n"
  when 'h2'
    result << "## #{contents}\n"
  when 'h3'
    result << "### #{contents}\n"
  when 'hr'
    result << "****\n"
  when 'br'
    result << "\n"
  when 'img'
    result << "![#{node['alt']}](#{node['src']})"
  when 'a'
    result << "[#{contents}](#{node['href']})"
  else
    result << contents unless contents == nil
  end
  result
end