Class: Grabepg::GrabTvsou

Inherits:
Object
  • Object
show all
Includes:
Grabepg
Defined in:
lib/grabepg/grab_tvsou.rb

Constant Summary collapse

# Maps the CSS class of a home-page <li> element to its channel category
# label: CCTV ("央视"), satellite ("卫视"), overseas ("海外"), local ("地方").
ChannelTypeMap = {
  "yg_ys_li" => "央视",
  "yg_ws_li" => "卫视",
  "yg_hw_li" => "海外",
  "yg_df_li" => "地方"
}

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(grabtype, proxy_list) ⇒ GrabTvsou

grabtype 从 mobile 还是网站接口抓取数据



31
32
33
34
35
36
37
# File 'lib/grabepg/grab_tvsou.rb', line 31

# Sets up a grabber for the tvsou site.
#
# @param grabtype [String] source selector passed to #get_url to pick the home page
# @param proxy_list [Object] proxy list, stored as-is and handed to the GrabBase fetcher
def initialize(grabtype, proxy_list)
  @home_page  = get_url(grabtype)
  @proxy_list = proxy_list
  @grabbase   = GrabBase.new
  # Channel store; starts empty.
  @channels   = Hash.new
  # Base address used when building absolute links.
  @site       = "http://m.tvsou.com"
end

Instance Attribute Details

#channels ⇒ Object (readonly)

频道存储



19
20
21
# File 'lib/grabepg/grab_tvsou.rb', line 19

# Channel store (read-only).
# Returns @channels, which #initialize sets to an empty Hash and
# #dispose_home_page fills with the channels found on the home page.
def channels
  @channels
end

#default_min_interval ⇒ Object (readonly)

两个节目间的最小间隔时间



25
26
27
# File 'lib/grabepg/grab_tvsou.rb', line 25

# Minimum interval between two consecutive programs (read-only).
# Returns @default_min_interval.
# NOTE(review): no assignment to @default_min_interval is visible in this
# listing, so this may return nil — confirm where it is set.
def default_min_interval
  @default_min_interval
end

#grabbase ⇒ Object (readonly)

Returns the value of attribute grabbase.



16
17
18
# File 'lib/grabepg/grab_tvsou.rb', line 16

# Returns @grabbase, the GrabBase instance created in #initialize and used
# by the dispose_* methods to fetch documents through the proxy list.
def grabbase
  @grabbase
end

#home_page ⇒ Object (readonly)

首页



11
12
13
# File 'lib/grabepg/grab_tvsou.rb', line 11

# Home page URL (read-only).
# Returns @home_page, which #initialize obtains from #get_url(grabtype).
def home_page
  @home_page
end

#proxy_list ⇒ Object (readonly)

代理列表



14
15
16
# File 'lib/grabepg/grab_tvsou.rb', line 14

# Proxy list (read-only).
# Returns @proxy_list, the value passed as +proxy_list+ to #initialize.
def proxy_list
  @proxy_list
end

#schedules ⇒ Object (readonly)

时间表存储



22
23
24
# File 'lib/grabepg/grab_tvsou.rb', line 22

# Schedule store (read-only).
# Returns @schedules.
# NOTE(review): no assignment to @schedules is visible in this listing, so
# this may return nil — confirm where it is set.
def schedules
  @schedules
end

Instance Method Details

#dispose_channel_page(url, channel_type) ⇒ Object

获取频道列表 url是获取频道列表的首页 地方需要调用此函数



118
119
120
# File 'lib/grabepg/grab_tvsou.rb', line 118

# Meant to fetch a channel list, where +url+ is the first page of that list;
# according to the original note, the "local" (地方) category needs to call this.
#
# NOTE(review): the body is empty, so the method currently ignores both
# arguments and returns nil.
def dispose_channel_page(url,channel_type)

end

#dispose_home_page ⇒ Object

对首页进行处理获取部分频道的URL和信息



77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/grabepg/grab_tvsou.rb', line 77

# Parses the home page and collects the channels it links to.
#
# Every <li> whose class attribute maps (through ChannelTypeMap) to "央视" or
# "卫视" has its links merged into @channels as
#   channel name => { url: <href>, type: "CCTV" | "WTV" }
# Within one <li> the first link seen for a given name wins. The "海外" and
# "地方" categories are recognised by the map but nothing is collected for them.
#
# Fetching and parsing are attempted up to 5 times. The attempt counter is
# local, so a failure left over from another method cannot skew it, and the
# error object is bound so the final raise reports the real cause.
#
# @return [Hash] @channels
# @raise [RuntimeError] with the last error's message after 5 failed attempts
def dispose_home_page
  collect_links = lambda do |li, type|
    li.css('a').each_with_object({}) do |a, found|
      name = a.content
      found[name] = { url: a.get_attribute("href"), type: type } unless found.key?(name)
    end
  end

  attempts = 0
  begin
    doc = @grabbase.get_doc_with_proxy(@proxy_list, @home_page)
    doc.css("li").each do |li|
      case ChannelTypeMap[li.get_attribute("class")]
      when "央视"
        @channels.merge!(collect_links.call(li, "CCTV"))
      when "卫视"
        @channels.merge!(collect_links.call(li, "WTV"))
      end
    end
    # Kept for the other dispose_* methods, which share this counter.
    @error_num = 0
  rescue => err
    attempts += 1
    raise err.to_s if attempts >= 5
    retry
  end
  @channels
end

#dispose_href_schedule_data(href, start_time, use_time) ⇒ Object

获取频道时间表URL



147
148
149
150
151
152
153
154
155
156
157
158
159
160
# File 'lib/grabepg/grab_tvsou.rb', line 147

# Builds one schedule URL per day by swapping the value of the "programDT"
# query parameter in +href+ for each date produced by #get_data.
#
# @param href [String] schedule link containing "&programDT=<value>"
# @param start_time [Integer] day offset forwarded to #get_data
# @param use_time [Integer] number of days forwarded to #get_data
# @return [Array<Hash>] entries of the form { url:, time:, date: }
def dispose_href_schedule_data(href, start_time, use_time)
  head, tail = href.split("&programDT=")
  # Everything after the old programDT value is carried over unchanged.
  trailing_params = tail.split("&").drop(1)

  get_data(start_time, use_time).map do |day|
    pieces = [head + "&programDT=" + day[:time]] + trailing_params
    { url: pieces.join("&"), time: day[:time], date: day[:date] }
  end
end

#dispose_schedule_page(url, start_time, use_time) ⇒ Object

根据URL解析时间表页面



163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
# File 'lib/grabepg/grab_tvsou.rb', line 163

# Parses a channel's schedule pages.
#
# First loads +url+ and reads the first link inside div[class='week']; that
# link, appended to the part of +url+ before "?", is the template handed to
# #dispose_href_schedule_data to get one URL per requested day. Each day page
# is then fetched and every li[class='gray'] inside div[class="time"] becomes
#   { time:, schedule_name:, url: }
# where schedule_name is the <span> text with the " 剧情" label removed and url
# is @site + "/" + the href of the item's last link.
#
# Loading the week link is attempted up to 5 times with a local counter; the
# retry re-runs the fetch in place, so the values it produces are actually used.
#
# NOTE(review): two entries are treated as "too close" when their HHMM integers
# differ by less than 5 (negative differences included); the previous entry is
# then removed. This rule is kept exactly as it was.
#
# @param url [String] schedule page URL of the channel
# @param start_time [Integer] day offset forwarded to #dispose_href_schedule_data
# @param use_time [Integer] number of days forwarded to #dispose_href_schedule_data
# @return [Hash] date label => Array of schedule hashes
# @raise [RuntimeError] with the last error's message if the week link cannot
#   be read after 5 attempts
def dispose_schedule_page(url, start_time, use_time)
  base = url.split("?")[0]

  attempts = 0
  begin
    doc = @grabbase.get_doc_with_proxy(@proxy_list, url)
    week_href = doc.css("div[class='week']")[0].css('a')[0].get_attribute("href")
    template = base + week_href
    # Kept for the other dispose_* methods, which share this counter.
    @error_num = 0
  rescue => err
    attempts += 1
    raise err.to_s if attempts >= 5
    retry
  end

  ret = {}
  last_time = -5
  last_schedule = {}
  dispose_href_schedule_data(template, start_time, use_time).each do |day|
    p "Grab url: #{day}"
    next unless day

    day_doc = @grabbase.get_doc_with_proxy(@proxy_list, day[:url])
    # A failed fetch (nil) is reported the same way as a page without the block.
    div = day_doc && day_doc.css('div[class="time"]')[0]
    unless div
      p "Error In this url: #{day} couldn't get doc.css('div[class=time]')[0]"
      next
    end

    schedules = []
    div.css("li[class='gray']").each do |schedule|
      begin
        link = schedule.css('a').last
        unless link
          # Items without a detail link are logged and skipped, as before.
          p "Schedule: #{schedule}"
          next
        end
        label = schedule.css("span")[0].text
        time = schedule.content.gsub(label, "")
        # gsub removes the " 剧情" label only; String#delete would strip those
        # characters anywhere in the name.
        entry = {
          time: time,
          schedule_name: label.gsub(" 剧情", ""),
          url: @site + "/" + link.get_attribute("href")
        }
        schedules << entry
        now = time.gsub(":", "").to_i
        schedules.delete(last_schedule) if (now - last_time) < 5
        last_schedule = entry
        last_time = now
      rescue StandardError
        p "Schedule: #{schedule}"
      end
    end
    ret[day[:date]] = schedules
  end
  ret
end

#dispose_show_info(url) ⇒ Object

解析节目详情页面



222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
# File 'lib/grabepg/grab_tvsou.rb', line 222

# Parses a program detail page.
#
# Reads the title from div[class="tv_info_top"], and from div[class='tv_info']
# the first image's src (spaces removed) and the first paragraph's text with
# the "[全文]" marker removed.
#
# Fetching and parsing are attempted up to 5 times. A nil document counts as a
# failed attempt. The retry re-runs the fetch in place, so a successful retry
# is what gets returned.
#
# @param url [String] detail page URL
# @return [Hash] { show_name:, img_url:, show_info: }
# @raise [RuntimeError] with the last error's message after 5 failed attempts
def dispose_show_info(url)
  attempts = 0
  begin
    doc = @grabbase.get_doc_with_proxy(@proxy_list, url)
    raise "No document returned for #{url}" if doc.nil?

    show_name = doc.css('div[class="tv_info_top"]')[0].content
    info = doc.css("div[class='tv_info']")
    img_url = info.css("img")[0].get_attribute("src").gsub(" ", "")
    show_info = info.css("p")[0].content.gsub("[全文]", "")
    # Kept for the other dispose_* methods, which share this counter.
    @error_num = 0
    { show_name: show_name, img_url: img_url, show_info: show_info }
  rescue => err
    attempts += 1
    raise err.to_s if attempts >= 5
    retry
  end
end

#get_channel_logo(_url, channel_type, no_dis = false) ⇒ Object

获取频道图标地址 url 手机表的URL值 channel_type 频道类型 no_dis 直接使用URL 不处理



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
# File 'lib/grabepg/grab_tvsou.rb', line 126

# Returns the address of a channel's logo image.
#
# Unless +no_dis+ is set, the TVid and Channelid query values are taken from
# +_url+ and used to build the epg.tvsou.com page for the channel type. That
# page (or +_url+ itself when +no_dis+ is true) is fetched and the src of the
# first <img> inside div[id='epg_m1'] is returned.
#
# @param _url [String] mobile schedule URL containing "TVid=" and "Channelid=",
#   or the page to load directly when +no_dis+ is true
# @param channel_type [String] "CCTV" or "WTV"
# @param no_dis [Boolean] when true, +_url+ is used without any processing
# @return [String] value of the logo image's src attribute
# @raise [ArgumentError] if +channel_type+ is not supported and +no_dis+ is false
def get_channel_logo(_url, channel_type, no_dis = false)
  if no_dis
    url = _url
  else
    tvid = _url.split("TVid=")[1].split("&")[0]
    channelid = _url.split("Channelid=")[1].split("&")[0]
    section = { "CCTV" => "programys", "WTV" => "programws" }[channel_type]
    # Fail early instead of handing a nil URL to the fetcher.
    raise ArgumentError, "unsupported channel type: #{channel_type.inspect}" unless section

    url = "http://epg.tvsou.com/#{section}/TV_#{tvid}/Channel_#{channelid}/W1.htm"
  end
  doc = @grabbase.get_doc_with_proxy(@proxy_list, url)
  doc.css("div[id='epg_m1']").css("img")[0].get_attribute("src")
end

#get_data(start_time, use_time) ⇒ Object

获取时间 start_time 时间起始点 use_time 天数



65
66
67
68
69
70
71
72
73
# File 'lib/grabepg/grab_tvsou.rb', line 65

# Produces the date descriptors for a run of consecutive days.
#
# @param start_time [Integer] offset in days from now of the first day
# @param use_time [Integer] how many consecutive days to produce
# @return [Array] one #get_data_year_month_day result per day, in order
def get_data(start_time, use_time)
  first_day = Time.now + start_time * 24 * 60 * 60
  use_time.times.map do |offset|
    get_data_year_month_day(first_day + offset * 24 * 60 * 60)
  end
end

#get_data_year_month_day(time) ⇒ Object



49
50
51
52
53
54
55
56
57
58
59
60
# File 'lib/grabepg/grab_tvsou.rb', line 49

# Formats +time+ into the two strings used for schedule pages.
#
# NOTE(review): in :time the day is zero-padded but the month is not
# (e.g. "2014-3-05"), while :date pads both. This looks unintentional;
# confirm what the site expects before changing it.
#
# @param time [Time] the day to describe
# @return [Hash] { time: "<year>-<month>-<dd>", date: "<weekday label>(<mm>-<dd>)" },
#   the weekday label coming from @grabbase.conversion_what_day(time.wday)
def get_data_year_month_day(time)
  padded_month = time.month.to_s.rjust(2, "0")
  padded_day = time.day.to_s.rjust(2, "0")
  weekday = @grabbase.conversion_what_day(time.wday)

  {
    time: "#{time.year}-#{time.month}-#{padded_day}",
    date: "#{weekday}(#{padded_month}-#{padded_day})"
  }
end

#get_proxy_list ⇒ Object



39
40
41
# File 'lib/grabepg/grab_tvsou.rb', line 39

# Returns @proxy_list, the proxy list given to #initialize.
# Returns the same value as the #proxy_list reader.
def get_proxy_list
  @proxy_list
end

#get_url(type) ⇒ Object

获取从tvsou的什么网站上获取 type: mobile,webpage



45
46
47
# File 'lib/grabepg/grab_tvsou.rb', line 45

# Picks the tvsou entry page for the given source type.
#
# @param type [String] source selector; only "mobile" is handled here
# @return [String, nil] the mobile index URL for "mobile", otherwise nil
def get_url(type)
  mobile_requested = type.eql?("mobile")
  mobile_requested ? "http://m.tvsou.com/index.asp" : nil
end