Class: JavlibrarySpider

Inherits:
Object
  • Object
show all
Defined in:
lib/javlibrary.rb

Constant Summary collapse

JAVLIBRARY_URL =
[ "jav11b.com", "javlibrary.com" ]

Instance Method Summary collapse

Constructor Details

#initialize(database_name = 'javlibrary', user = 'root', pwd = 'default') ⇒ JavlibrarySpider

Returns a new instance of JavlibrarySpider.



13
14
15
16
17
18
19
20
21
# File 'lib/javlibrary.rb', line 13

# Builds a spider from MySQL connection settings and selects the default site host.
#
# @param database_name [String] name of the MySQL database
# @param user [String] MySQL user name
# @param pwd [String] MySQL password
def initialize(database_name = 'javlibrary', user = 'root', pwd = 'default')
    # Connection settings kept on the instance.
    @database, @username, @password = database_name, user, pwd

    # The first entry of JAVLIBRARY_URL is the default host.
    @url = JAVLIBRARY_URL.first
end

Instance Method Details

#actor_hash ⇒ Object



129
130
131
132
133
134
135
136
137
138
# File 'lib/javlibrary.rb', line 129

# Loads every row of the `actor` table into a name => id lookup.
#
# @return [Hash] each row's 'actor_name' (as a String) mapped to its 'actor_id'
def actor_hash
    # The local must not be named `client`: `client = client` declares a nil
    # local that shadows the #client method instead of calling it.
    db = client
    begin
        db.query("SELECT * FROM actor").each_with_object({}) do |item, lookup|
            lookup[item['actor_name'].to_s] = item['actor_id']
        end
    ensure
        # Release the connection even when the query raises.
        db.close
    end
end

#author_page_num(nokogiri_doc) ⇒ Object



181
182
183
184
185
186
187
# File 'lib/javlibrary.rb', line 181

# Determines how many pages a listing has from its "last page" link.
#
# @param nokogiri_doc [#search] parsed listing page
# @return [Integer] the integer after the final "=" of the matching link's
#   href, or 1 when no such link is found
def author_page_num(nokogiri_doc)
    last_links = nokogiri_doc.search('//div[@class="page_selector"]/a[@class="page last"]')
    # If several links match, the final one decides.
    last_links.reduce(1) { |_page, link| link['href'].split("=")[-1].to_i }
end

#client ⇒ Object



23
24
25
26
27
28
# File 'lib/javlibrary.rb', line 23

# Opens a new MySQL connection to 127.0.0.1 with the settings stored on the
# instance. The connection is not closed here; that is left to the caller.
#
# @return [Mysql2::Client] a fresh connection
def client
    # #initialize stores the user name in @username. The previous @user was
    # never assigned, so the user name passed here was always nil.
    Mysql2::Client.new(:host => "127.0.0.1",
                       :username => @username,
                       :password => @password,
                       :database => @database)
end

#download_all_video_labels ⇒ Object



287
288
289
290
291
292
293
294
295
296
# File 'lib/javlibrary.rb', line 287

# Runs #select_actor for every type 'A'..'Z', one thread per letter, and
# waits for all of them to finish.
#
# @return [Array<Thread>] the joined worker threads
def download_all_video_labels
    workers = ('A'..'Z').map do |letter|
        Thread.new { select_actor(letter) }
    end
    workers.map(&:join)
end

#download_all_videos ⇒ Object



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# File 'lib/javlibrary.rb', line 93

# Downloads details for every label row whose video_download is 0.
#
# Pending rows are split into groups of 5000. Each group is handled by its own
# thread with its own database connection, and every row is passed to
# #video_info_insert. A row whose insert raises is skipped.
#
# @return [Array<Thread>] the joined worker threads
def download_all_videos
    # Named `db` on purpose: `client = client` would declare a nil local that
    # shadows the #client method instead of calling it.
    db = client
    begin
        # Materialise the rows before the connection is closed.
        pending = db.query("SELECT video_num, video_label FROM label WHERE video_download=0").to_a
    ensure
        db.close
    end

    # NOTE(review): these lookups go through a `Javlibrary` constant that is
    # not visible in this part of the file, while this class defines
    # #actor_hash and #genre_hash itself — confirm which one is intended.
    actor_hash = Javlibrary::actor_hash
    genre_hash = Javlibrary::genre_hash

    workers = pending.each_slice(5000).map do |group|
        Thread.new do
            # One connection per thread, opened through the #client method.
            conn = client
            begin
                group.each do |item|
                    begin
                        video_info_insert(conn, item['video_num'], item['video_label'],
                            actor_hash, genre_hash)
                    rescue StandardError
                        # Best effort: a failing video must not stop the group.
                        next
                    end
                end
            ensure
                conn.close
            end
            GC.start
        end
    end
    workers.map(&:join)
end

#download_video_label(actor_id) ⇒ Object



232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
# File 'lib/javlibrary.rb', line 232

# Collects the video labels listed on an actor's pages and stores them in the
# `label` table with video_download set to '0'.
#
# Each HTTP request is tried at most five times. If the first page cannot be
# fetched nothing is stored; a later page that cannot be fetched is skipped.
# An INSERT that raises is skipped as well.
#
# @param actor_id [String] value placed in the `s` query parameter
# @return [void]
def download_video_label(actor_id)
    firsturl = "http://www.#{@url}/ja/vl_star.php?s=#{actor_id}"
    baseurl = "http://www.#{@url}/ja/vl_star.php?&mode=&s=#{actor_id}&page="
    # Bounded so that a permanently failing URL cannot spin forever.
    max_attempts = 5

    attempts = 0
    begin
        response = RestClient.get firsturl
    rescue StandardError => e
        attempts += 1
        retry if attempts < max_attempts
        warn "download_video_label: giving up on #{firsturl} (#{e.message})"
        return
    end

    # The number after the final "=" of the "last page" link is the page
    # count; without such a link there is a single page.
    last_links = Nokogiri::HTML(response.body).search('//div[@class="page_selector"]/a[@class="page last"]')
    last_page = last_links.reduce(1) { |_page, row| row['href'].split("=")[-1].to_i }

    labels = []
    1.upto(last_page) do |page|
        tempurl = baseurl + page.to_s
        page_attempts = 0
        begin
            response = RestClient.get tempurl
        rescue StandardError => e
            page_attempts += 1
            retry if page_attempts < max_attempts
            warn "download_video_label: skipping #{tempurl} (#{e.message})"
            next
        end

        # The label is whatever follows the final "=" in each video link.
        Nokogiri::HTML(response.body).search('//div[@class="video"]/a').each do |row|
            labels << row['href'].split("=")[-1]
        end
    end

    # Named `db` on purpose: `client = client` would declare a nil local that
    # shadows the #client method instead of calling it.
    db = client
    begin
        labels.each do |label|
            begin
                # Labels come from scraped HTML, so escape them before quoting.
                db.query("INSERT INTO label (video_label, video_download) VALUES ('#{db.escape(label)}', '0')")
            rescue StandardError
                next
            end
        end
    ensure
        db.close
    end
    nil
end

#downloader(identifer) ⇒ Object



30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# File 'lib/javlibrary.rb', line 30

# Fetches one video page and flattens its details into a "$"-separated record.
#
# A timeout is retried at most five times; any other error, or running out of
# attempts, makes the method return nil.
#
# NOTE(review): a scraped field that itself contains "$" would shift the
# fields for anyone splitting the record on "$".
#
# @param identifer [String] value placed in the `v` query parameter
# @return [String, nil] title$id$date$director$maker$label$cast$genres$img_url,
#   or nil when the page could not be fetched
def downloader(identifer)
    baseurl = "http://www.#{@url}/cn/?v=#{identifer}"
    response = Mechanize.new
    # Sample from the aliases that exist; a fixed rand(21) index gives a nil
    # user agent whenever the table has fewer entries than that.
    response.user_agent = Mechanize::AGENT_ALIASES.values.sample
    attempts = 0
    begin
        response.get baseurl
    rescue Timeout::Error
        attempts += 1
        retry if attempts < 5
        return
    rescue StandardError
        return
    end

    doc = Nokogiri::HTML(response.page.body)

    video_title = doc.search('div[@id="video_title"]/h3/a').children.text

    # Text of every info cell, in page order; fields are picked by position below.
    details = doc.search('//div[@id="video_info"]/div[@class="item"]/table/tr/td[@class="text"]').map do |row|
        row.children.text
    end

    # Each genre is followed by a single space, including the last one.
    video_genres = doc.search('//div[@id="video_genres"]/table/tr/td[@class="text"]/span[@class="genre"]/a').map do |row|
        "#{row.children.text} "
    end.join

    # Empty when no jacket image is found; otherwise the last match's src.
    video_jacket_img = ''
    doc.search('//img[@id="video_jacket_img"]').each do |row|
        video_jacket_img = row['src']
    end

    # return data format: title$id$date$director$maker$label$cast$genres$img_url
    "#{video_title}$#{details[0]}$#{details[1]}$#{details[2]}$#{details[3]}$#{details[4]}$#{details[-1]}$#{video_genres}$#{video_jacket_img}"
end

#genre_hash ⇒ Object



140
141
142
143
144
145
146
147
148
149
# File 'lib/javlibrary.rb', line 140

# Loads every row of the `category` table into a name => id lookup.
#
# @return [Hash] each row's 'category_name' (as a String) mapped to its 'category_id'
def genre_hash
    # The local must not be named `client`: `client = client` declares a nil
    # local that shadows the #client method instead of calling it.
    db = client
    begin
        db.query("SELECT * FROM category").each_with_object({}) do |item, lookup|
            lookup[item['category_name'].to_s] = item['category_id']
        end
    ensure
        # Release the connection even when the query raises.
        db.close
    end
end

#genres ⇒ Object



151
152
153
154
155
156
157
158
159
160
161
162
163
# File 'lib/javlibrary.rb', line 151

# Scrapes the genre names from the site's genre listing page.
#
# The request is tried at most five times; the last error is re-raised once
# the attempts are used up, instead of retrying forever.
#
# @return [Array<String>] genre names without duplicates, in page order
# @raise [StandardError] whatever the final failed request raised
def genres
    agent = Mechanize.new
    attempts = 0
    begin
        agent.get "http://www.#{@url}/cn/genres.php"
    rescue StandardError
        attempts += 1
        retry if attempts < 5
        raise
    end

    links = Nokogiri::HTML(agent.page.body).search('//div[@class="genreitem"]/a')
    links.map { |row| row.children.text }.uniq
end

#genres_insert ⇒ Object Also known as: download_all_genres



165
166
167
168
169
170
171
172
173
174
175
176
177
# File 'lib/javlibrary.rb', line 165

# Stores every genre name returned by #genres in the `category` table.
# An INSERT that raises is skipped.
#
# @return [void]
def genres_insert
    # Scrape first so the database connection is not held open meanwhile.
    names = genres

    # Named `db` on purpose: `client = client` would declare a nil local that
    # shadows the #client method instead of calling it.
    db = client
    begin
        names.each do |name|
            begin
                # Names come from scraped HTML, so escape them before quoting.
                db.query("INSERT INTO category (category_name) VALUES ('#{db.escape(name)}')")
            rescue StandardError
                next
            end
        end
    ensure
        db.close
    end
    nil
end

#get_all_actor ⇒ Object Also known as: download_all_actors



189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
# File 'lib/javlibrary.rb', line 189

# Walks the star listing for every prefix 'A'..'Z' and stores each actor's
# name, label and prefix letter in the `actor` table.
#
# Each HTTP request is tried at most five times; a prefix or page that still
# cannot be fetched is skipped. An INSERT that raises is skipped as well.
#
# @return [void]
def get_all_actor
    firsturl = "http://www.#{@url}/cn/star_list.php?prefix="
    # Bounded so that a permanently failing URL cannot spin forever.
    max_attempts = 5

    # Named `db` on purpose: `client = client` would declare a nil local that
    # shadows the #client method instead of calling it.
    db = client
    begin
        'A'.upto('Z') do |alphabet|
            tempurl = firsturl + alphabet
            attempts = 0
            begin
                response = RestClient.get tempurl
            rescue StandardError => e
                attempts += 1
                retry if attempts < max_attempts
                warn "get_all_actor: skipping #{tempurl} (#{e.message})"
                next
            end

            last_page = author_page_num(Nokogiri::HTML(response.body))

            1.upto(last_page) do |page_num|
                temp_page_url = "#{tempurl}&page=#{page_num}"
                page_attempts = 0
                begin
                    response_page = RestClient.get temp_page_url
                rescue StandardError => e
                    page_attempts += 1
                    retry if page_attempts < max_attempts
                    warn "get_all_actor: skipping #{temp_page_url} (#{e.message})"
                    next
                end

                doc_page = Nokogiri::HTML(response_page.body)
                doc_page.search('//div[@class="starbox"]/div[@class="searchitem"]/a').each do |row|
                    # Link text is the actor name; the label follows the
                    # final "=" of the link's href.
                    name = row.text
                    label = row['href'].split("=")[-1]
                    begin
                        # Both values are scraped, so escape them before quoting.
                        db.query("INSERT INTO actor (actor_name, actor_label, type) " \
                            "VALUES ('#{db.escape(name)}', '#{db.escape(label)}', '#{alphabet}')")
                    rescue StandardError
                        next
                    end
                end
            end
        end
    ensure
        db.close
    end
    nil
end

#select_actor(type) ⇒ Object



277
278
279
280
281
282
283
284
285
# File 'lib/javlibrary.rb', line 277

# Runs #download_video_label for every actor whose `type` column matches.
#
# @param type [String] value compared against the `type` column
# @return [Array] the rows that were selected
def select_actor(type)
    # Named `db` on purpose: `client = client` would declare a nil local that
    # shadows the #client method instead of calling it.
    db = client
    begin
        # Materialise the rows before the connection is closed.
        rows = db.query("SELECT actor_label FROM actor WHERE type='#{db.escape(type.to_s)}'").to_a
    ensure
        db.close
    end

    rows.each { |row| download_video_label(row["actor_label"]) }
end

#video_info_insert(client, index, identifer, actor_hash, genres_hash) ⇒ Object



67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/javlibrary.rb', line 67

# Downloads one video's details and stores them: a row in `video`, a link row
# per known cast member (`v2a`) and genre (`v2c`), and finally the label row
# is marked as downloaded.
#
# Nothing is written when #downloader returns nil or when the `video` INSERT
# raises. The connection is left open, because it belongs to the caller and
# may be reused for the next video.
#
# @param client [#query, #escape] open database connection owned by the caller
# @param index [Integer] video_num of the label row; used as video_id
# @param identifer [String] video label passed to #downloader
# @param actor_hash [Hash] actor name => actor id
# @param genres_hash [Hash] genre name => category id
# @return [nil]
def video_info_insert(client, index, identifer, actor_hash, genres_hash)
    record = downloader(identifer)
    # #downloader returns nil when the page could not be fetched.
    return if record.nil?

    title, id, date, director, maker, label, cast_tmp, genres_tmp, img_url = record.split('$')
    # to_s guards against short records, where trailing fields come back nil.
    cast = cast_tmp.to_s.split
    genres = genres_tmp.to_s.split

    # Every text field is scraped, so escape it before quoting it in SQL.
    quote = ->(value) { client.escape(value.to_s) }
    begin
        client.query("INSERT INTO video (video_id,video_name,license,url,director,label,date,maker) " \
            "VALUES (#{index},'#{quote.call(title)}','#{quote.call(id)}','#{quote.call(img_url)}'," \
            "'#{quote.call(director)}','#{quote.call(label)}','#{quote.call(date)}','#{quote.call(maker)}')")
    rescue StandardError
        return
    end

    cast.each do |a|
        actor_id = actor_hash[a]
        # Cast members missing from the lookup are not linked.
        next if actor_id.nil?
        client.query("INSERT INTO v2a (v2a_fk_video,v2a_fk_actor) VALUES(#{index}, #{actor_id.to_i})")
    end

    genres.each do |g|
        category_id = genres_hash[g]
        # Genres missing from the lookup are not linked.
        next if category_id.nil?
        client.query("INSERT INTO v2c (v2c_fk_video,v2c_fk_category) VALUES(#{index}, #{category_id.to_i})")
    end

    client.query("UPDATE label SET video_download=1 WHERE video_num=#{index}")
    nil
end