Class: Pork::Search

Inherits:
Object
  • Object
show all
Defined in:
lib/pork_sandwich/search.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(query, options = {}) ⇒ Search

Returns a new instance of Search.



6
7
8
9
10
11
12
13
14
15
# File 'lib/pork_sandwich/search.rb', line 6

# Builds a search for +query+; no request is made until #historical_pull runs.
#
# @param query [Object] the search query, later handed to Twitter::Search.new
# @param options [Hash] optional settings, all read with symbol keys:
#   :desired_count  - stop after this many tweets; when nil the pull goes as
#                     far back as the Search API allows
#   :since_id       - stop once a tweet id at or below this value is reached
#   :from_user      - passed to the search's +from+ filter when set
#   :collect_users  - stored as given (its only use in #historical_pull is
#                     currently commented out)
#   :pulls_per_hour - request pacing target; falls back to 1500 when absent
def initialize(query, options = {})
  # What to search for and how far to go.
  @query         = query
  @desired_count = options[:desired_count]
  @since_id      = options[:since_id]
  @from_user     = options[:from_user]
  @collect_users = options[:collect_users]

  # Progress bookkeeping, filled in by #historical_pull.
  @current_count  = 0
  @db_ids_created = []

  # Pacing: a nil/false option falls back to the default rate.
  @pulls_per_hour = options[:pulls_per_hour] || 1500
end

Instance Attribute Details

#current_count ⇒ Object (readonly)

Returns the value of attribute current_count.



4
5
6
# File 'lib/pork_sandwich/search.rb', line 4

# Number of tweets stored so far by this search. Starts at 0 in the
# constructor and is incremented once per tweet saved in #historical_pull.
def current_count
  @current_count
end

#db_ids_created ⇒ Object (readonly)

Returns the value of attribute db_ids_created.



4
5
6
# File 'lib/pork_sandwich/search.rb', line 4

# Array of the +id+ values of the objects returned by $SAVER.save, one per
# tweet stored in #historical_pull. Starts out empty.
def db_ids_created
  @db_ids_created
end

#desired_count ⇒ Object

Returns the value of attribute desired_count.



3
4
5
# File 'lib/pork_sandwich/search.rb', line 3

# Maximum number of tweets to collect, taken from options[:desired_count].
# When nil, the pull goes as far back as the Search API allows.
def desired_count
  @desired_count
end

#from_user ⇒ Object

Returns the value of attribute from_user.



3
4
5
# File 'lib/pork_sandwich/search.rb', line 3

# Optional value from options[:from_user]; when set, #historical_pull passes
# it to the search's +from+ filter.
def from_user
  @from_user
end

#pulls_per_hour ⇒ Object

Returns the value of attribute pulls_per_hour.



3
4
5
# File 'lib/pork_sandwich/search.rb', line 3

# Target number of search requests per hour, used by #manage_pull_rate to
# space out requests. Defaults to 1500 and is lowered by #reduce_pull_rate.
def pulls_per_hour
  @pulls_per_hour
end

#query ⇒ Object (readonly)

Returns the value of attribute query.



4
5
6
# File 'lib/pork_sandwich/search.rb', line 4

# The query this search was constructed with; #historical_pull hands it to
# Twitter::Search.new.
def query
  @query
end

#since_id ⇒ Object

Returns the value of attribute since_id.



3
4
5
# File 'lib/pork_sandwich/search.rb', line 3

# Optional lower bound on tweet ids, from options[:since_id]. When set,
# #reached_since_id? reports true for any id at or below it.
def since_id
  @since_id
end

Instance Method Details

#historical_pull ⇒ Object



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# File 'lib/pork_sandwich/search.rb', line 17

# Pages through Twitter search results for @query (100 per page), saving each
# tweet via $SAVER and recording the saved record ids in @db_ids_created.
#
# After every page, :max_id is set to the id of the page's last tweet so the
# next request continues from there. Paging stops when:
# * the desired count has been reached,
# * a tweet id at or below @since_id has been reached,
# * a page comes back empty, or
# * the last tweet of a page equals the current :max_id (no progress).
#
# Each rescued error is logged to $PORK_LOG (when set) and the pull is retried
# from the current :max_id, after a pause where one is configured below.
# NOTE(review): the retries are unbounded; a permanently failing request will
# loop forever. Confirm whether a retry cap is wanted.
#
# @return [true] once paging has finished
def historical_pull
  @search_params = Twitter::Search.new(@query).per_page(100)
  @search_params.from(@from_user) if @from_user
  begin
    loop do
      time_at_start = Time.now
      $PORK_LOG.write("historical pull, query = #{@query}, max_id = #{@search_params.query[:max_id].to_s}") if $PORK_LOG

      # NOTE(review): fetch is run on a copy, presumably so the response is
      # not cached on @search_params between pages - confirm.
      @return_data = @search_params.dup.fetch
      if @return_data.error == "You have been rate limited. Enhance your calm."
        raise Pork::RateLimitExceeded
      end

      @tweets_pulled = @return_data.results
      # An empty page means there is nothing further to fetch. Without this
      # guard the `last.id` lookup below would be sent to nil.
      break if @tweets_pulled.empty?

      @tweets_pulled.each do |tweet|
        tweet.status_id = tweet.id
        break if reached_desired_count? || reached_since_id?(tweet.status_id)
        @db_ids_created << $SAVER.save(tweet, &TWEET_SAVE).id
        # $CRAWLER.append(tweet.from_user) if @collect_users
        @current_count += 1
      end

      last_id = @tweets_pulled.last.id
      break if reached_desired_count? || @search_params.query[:max_id] == last_id || reached_since_id?(last_id)

      @search_params.query[:max_id] = last_id
      manage_pull_rate(time_at_start)
    end
  rescue Twitter::Unavailable
    $PORK_LOG.write("ERROR: Twitter unavailable, trying in 60") if $PORK_LOG
    sleep 60
    retry
  rescue Twitter::NotFound
    $PORK_LOG.write("ERROR: Info target not found, trying to skip") if $PORK_LOG
    retry
  rescue Crack::ParseError
    $PORK_LOG.write("Error: JSON Parsing error, trying to skip past problem tweet") if $PORK_LOG
    # Step :max_id back to get past the tweet that could not be parsed.
    @search_params.query[:max_id] -= 1000 if @search_params.query[:max_id]
    # The failed request's start time is not visible here, so pace from now.
    # (Calling manage_pull_rate with no argument raised ArgumentError.)
    manage_pull_rate(Time.now)
    retry
  rescue Errno::ETIMEDOUT
    $PORK_LOG.write("ERROR: Puller timed out, retrying in 10") if $PORK_LOG
    sleep 10
    retry
  rescue Twitter::InformTwitter
    $PORK_LOG.write("ERROR: Twitter internal error, retrying in 30") if $PORK_LOG
    sleep 30
    retry
  rescue Pork::RateLimitExceeded
    $PORK_LOG.write("ERROR: Rate limit exceeded; holding off for a bit then trying again") if $PORK_LOG
    sleep 300
    reduce_pull_rate
    retry
  rescue Timeout::Error
    $PORK_LOG.write("ERROR: Request Timed out. Retrying in 30") if $PORK_LOG
    sleep 30
    retry
  end
  true
end

#manage_pull_rate(time_at_start) ⇒ Object



102
103
104
105
106
107
108
109
110
111
# File 'lib/pork_sandwich/search.rb', line 102

# Sleeps for whatever remains of the per-request time budget implied by
# @pulls_per_hour, so that requests are spaced out to roughly that rate.
#
# @param time_at_start [Time] when the request being paced began. Defaults to
#   the current time, which yields a full-length pause; #historical_pull's
#   JSON-parse-error handler calls this method without an argument.
# @return [Integer] the value returned by Kernel#sleep
def manage_pull_rate(time_at_start = Time.now)
  desired_pause = 3600.0 / @pulls_per_hour # seconds allotted to one request
  pull_duration = Time.now - time_at_start
  # Never pass a negative duration to sleep when the request overran its slot.
  sleep [desired_pause - pull_duration, 0].max
end

#reached_desired_count? ⇒ Boolean

Returns:

  • (Boolean)


94
95
96
97
98
99
100
# File 'lib/pork_sandwich/search.rb', line 94

# Whether the number of tweets stored so far has hit the requested limit.
#
# @return [Boolean] false when no desired count is set; otherwise true once
#   @current_count is at least @desired_count
def reached_desired_count?
  return false unless @desired_count

  @current_count >= @desired_count
end

#reached_since_id?(id) ⇒ Boolean

Returns:

  • (Boolean)


119
120
121
122
123
124
125
# File 'lib/pork_sandwich/search.rb', line 119

# Whether +id+ is at or below the configured since_id boundary.
#
# @param id [Integer] a tweet id to compare against @since_id
# @return [Boolean] false when no since_id is set; otherwise id <= @since_id
def reached_since_id?(id)
  return false unless @since_id

  id <= @since_id
end

#reduce_pull_rate ⇒ Object



113
114
115
116
117
# File 'lib/pork_sandwich/search.rb', line 113

# Lowers the request pacing target by 100 pulls per hour, leaving it
# untouched once it is at 100 or below.
#
# @return [Integer, nil] the new rate, or nil when nothing was changed
def reduce_pull_rate
  @pulls_per_hour -= 100 if @pulls_per_hour > 100
end