Class: Vcs2Json::Git

Inherits:
Object
  • Object
show all
Includes:
Logging
Defined in:
lib/vcs2json/git.rb

Constant Summary collapse

FIELD_SEP =

Generate separators between fields and commits

Digest::SHA256.hexdigest Time.new.to_s + "field_sep"
META_DATA =
"%H#{FIELD_SEP}"\
"%an#{FIELD_SEP}"\
"%ae#{FIELD_SEP}"\
"%ad#{FIELD_SEP}"\
"%cn#{FIELD_SEP}"\
"%ce#{FIELD_SEP}"\
"%cd#{FIELD_SEP}"\
"%B"

Instance Attribute Summary collapse

Instance Method Summary collapse

Methods included from Logging

configure_logger_for, #logger, logger_for, set_level, set_location

Constructor Details

#initialize(opts) ⇒ Git

Returns a new instance of Git.



21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# File 'lib/vcs2json/git.rb', line 21

# Builds a Git extractor from an options hash.
#
# Keys read from +opts+: :case_id, :ignore, :before, :after, :number,
# :fine_grained, :logger_location, :logger_level, :ignore_comments,
# :ignore_whitespace and :residuals.
#
# @param opts [Hash] option values, looked up by the symbol keys above
def initialize(opts)
  # Assignment order matters: case_id has to be stored before ignore,
  # because the id is used to look up the list of files to ignore
  # in the ignorefile.
  [:case_id, :ignore, :before, :after, :number, :fine_grained].each do |option|
    __send__("#{option}=", opts[option])
  end

  # Logger destination and verbosity
  Logging.set_location(opts[:logger_location])
  Logging.set_level(opts[:logger_level])

  # SrcML settings
  SrcML.ignore_comments   = opts[:ignore_comments]
  SrcML.ignore_whitespace = opts[:ignore_whitespace]
  SrcML.residuals         = opts[:residuals]

  # Nothing left to verify unless fine grained extraction was requested
  return unless self.fine_grained

  # Probe for the srcml executable; fall back to file level when it is missing
  begin
    Open3.capture3("srcml --version")
  rescue Errno::ENOENT
    $stderr.puts "SrcML is required for fine grained change history extraction, please install from www.srcml.com"
    $stderr.puts "Defaulting to file level"
    self.fine_grained = false
  end
end

Instance Attribute Details

#case_id ⇒ Object

Returns the value of attribute case_id.



7
8
9
# File 'lib/vcs2json/git.rb', line 7

# Reader for the case_id attribute.
#
# According to the comment in the constructor, this id is used to look up
# the list of files to ignore in the ignorefile.
#
# @return [Object] the current value of @case_id
def case_id
  @case_id
end

#fine_grained ⇒ Object

Returns the value of attribute fine_grained.



7
8
9
# File 'lib/vcs2json/git.rb', line 7

# Reader for the fine_grained attribute.
#
# When truthy, #parse asks SrcML for method-level changes instead of
# recording only the names of the changed files. The constructor resets it
# to false when the srcml executable cannot be started.
#
# @return [Object] the current value of @fine_grained
def fine_grained
  @fine_grained
end

#ignore ⇒ Object

Returns the value of attribute ignore.



8
9
10
# File 'lib/vcs2json/git.rb', line 8

# Reader for the ignore attribute.
#
# #parse calls include? on this value with each changed file name and drops
# the files for which it answers true.
#
# @return [Object] the current value of @ignore
def ignore
  @ignore
end

#number ⇒ Object

Returns the value of attribute number.



7
8
9
# File 'lib/vcs2json/git.rb', line 7

# Reader for the number attribute.
#
# #parse stops once this many commits have been printed, and requests ten
# times this many revision ids from git as a search window.
#
# @return [Object] the current value of @number
def number
  @number
end

Instance Method Details

#after ⇒ Object



63
64
65
# File 'lib/vcs2json/git.rb', line 63

# Renders the stored lower date bound as a git command-line option.
#
# @return [String] an empty string when no bound is stored, otherwise the
#   stored value wrapped as --after="<value>"
def after
  return '' if @after.nil?

  %(--after="#{@after}")
end

#after=(after) ⇒ Object



51
52
53
54
55
56
57
58
59
60
61
# File 'lib/vcs2json/git.rb', line 51

# Stores the lower date bound after checking that Date.parse accepts it.
#
# A nil argument leaves the currently stored value untouched. A value that
# Date.parse rejects is reported on STDERR and the stored bound is cleared.
#
# @param after [String, nil] the date to filter on
def after=(after)
  return if after.nil?

  # Parsed only for validation; the original string is what gets stored
  Date.parse(after)
  @after = after
rescue StandardError
  STDERR.puts "Invalid date --after=#{after}. Ignoring option."
  @after = nil
end

#before ⇒ Object



79
80
81
# File 'lib/vcs2json/git.rb', line 79

# Renders the stored upper date bound as a git command-line option.
#
# @return [String] an empty string when no bound is stored, otherwise the
#   stored value wrapped as --before="<value>"
def before
  return '' if @before.nil?

  %(--before="#{@before}")
end

#before=(before) ⇒ Object



67
68
69
70
71
72
73
74
75
76
77
# File 'lib/vcs2json/git.rb', line 67

# Stores the upper date bound after checking that Date.parse accepts it.
#
# A nil argument leaves the currently stored value untouched. A value that
# Date.parse rejects is reported on STDERR and the stored bound is cleared.
#
# @param before [String, nil] the date to filter on
def before=(before)
  return if before.nil?

  # Parsed only for validation; the original string is what gets stored
  Date.parse(before)
  @before = before
rescue StandardError
  STDERR.puts "Invalid date --before=#{before}. Ignoring option."
  @before = nil
end

#parse ⇒ Object



83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
# File 'lib/vcs2json/git.rb', line 83

# Walks the non-merge commits reachable from HEAD (optionally narrowed by the
# before/after options) and prints one JSON object per non-empty commit to
# $stdout, stopping once +number+ commits have been printed.
#
# Each printed object carries :sha, :name, :email, :date, :committer_name,
# :committer_email, :committer_date, :message and :changes. :changes holds
# file names, or whatever SrcML reports per file when fine_grained is on.
#
# Progress, a shortfall warning and the ids of skipped (empty) commits are
# written to STDERR. The return value is not meaningful.
#
# @raise [EncodingError] when the commit metadata cannot be encoded as UTF-8
def parse
  # keeps track of number of commits successfully parsed
  commit_counter = 0

  # keeps track of empty commits
  empty_commits = []

  ##########################
  # GET LIST OF COMMIT IDS #
  ##########################

  # getting the list of revision ids is cheap, so we get some extra in case we are unable to parse the required amount in the first 'n' commits
  commit_ids = `git rev-list HEAD #{self.before} #{self.after} -n #{self.number*10} --no-merges`.split

  ############################
  # ITERATE OVER EACH COMMIT #
  ############################

  commit_ids.each do |id|
    logger.debug "Parsing commit: #{id}"

    # get the changed files as [status, file_name] pairs; lines whose status
    # is not a single A, M or D stay unsplit and are skipped further down
    changed_files = `git log --pretty=format:'' --name-status #{id} -n 1`.split("\n")
                      .map {|line| line.split(/(^[AMD])\s+/).delete_if {|e| e.empty?}}

    # remove ignored files
    changed_files.reject! do |file|
      ignored = self.ignore.include?(file[1])
      logger.debug "[IGNOREDEBUG] Ignored #{file[1]} in commit #{id}" if ignored
      ignored
    end

    # no files in commit
    if changed_files.empty?
      empty_commits << id
      next
    end

    ##################
    # FETCH METADATA #
    ##################

    raw_commit = `git log --pretty=format:'#{META_DATA}' #{id} -n 1`
    commit = ''

    ##################
    # CLEAN RAW DATA #
    ##################

    begin
      # try encoding to utf8
      # NOTE(review): transcoding from 'binary' with undef: :replace appears to
      # drop every non-ASCII byte, not only invalid ones - confirm this is intended
      commit = raw_commit.encode('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
      # need to explicitly check if the encoding is valid for ruby <= 2.0
      # utf8 -> utf8 will not do anything even with invalid bytes
      # http://stackoverflow.com/questions/24036821/ruby-2-0-0-stringmatch-argumenterror-invalid-byte-sequence-in-utf-8
      if !commit.valid_encoding?
        # encode to utf16 first and then back to utf8
        commit.encode!("UTF-16be", invalid: :replace, undef: :replace, :replace=>'')
        commit.encode!('UTF-8', 'binary', invalid: :replace, undef: :replace, replace: '')
      end
    rescue ArgumentError
      raise EncodingError, "Unable to encode input as UTF-8"
    end

    ##############################
    # CONSTRUCT OUTPUT HASH/JSON #
    ##############################

    # field order follows the order of the placeholders in META_DATA
    fields = commit.split(FIELD_SEP)
    output_hash = {
      sha:             fields[0].delete("\n"), # remove astray newlines
      name:            fields[1],
      email:           fields[2],
      date:            Time.parse(fields[3]),
      committer_name:  fields[4],
      committer_email: fields[5],
      committer_date:  Time.parse(fields[6]),
      message:         fields[7],
      changes:         []
    }

    #######################################
    # PARSE FILES FOR FINEGRAINED CHANGES #
    #######################################

    changed_files.each_with_index do |(status,file_name),index|
      # print progress
      STDERR.print "Parsing file #{index+1} Of #{changed_files.size} in commit #{commit_counter+1} of #{self.number}                  \r"

      # ignore entries that did not split into a status and a file name
      next if status.nil? || file_name.nil? || status.empty? || file_name.empty?

      unless self.fine_grained
        output_hash[:changes] << file_name
        next
      end

      # add finer grained change info
      begin
        if status == 'A'
          # new file, all methods are new, no need to calculate diff
          SrcML.methods(file_name,revision: id).keys.each {|m| output_hash[:changes] << m}
        else
          # calculate diffs
          SrcML.changed_methods_git(file_name,id).each {|m| output_hash[:changes] << m}
        end
      rescue SrcML::UnsupportedLanguageError, SrcML::ParseError
        # fall back to file level for this file
        output_hash[:changes] << file_name
      end
    end # changed_files.each_with_index

    # Only add commits where at least one change was detected
    if output_hash[:changes].empty?
      empty_commits << id
      next
    end

    ###########################
    # PRINT COMMIT TO $stdout #
    ###########################

    $stdout.puts output_hash.to_json

    # increase counter for number of commits successfully parsed
    commit_counter += 1

    ########################################
    # CHECK IF REQUESTED AMOUNT IS REACHED #
    ########################################

    break if commit_counter == self.number
  end

  # we may still lack commits after exhaustive search, notify user
  # (the loop never broke early in this case, so every id in commit_ids was examined)
  if commit_counter < self.number
    STDERR.puts "Asked for #{self.number} commits, only found #{commit_counter} non-empty commits in the last #{commit_ids.size} commits"
  end

  # print ids of empty commits to stderr
  unless empty_commits.empty?
    STDERR.puts "EMPTY COMMITS"
    STDERR.puts empty_commits
  end
end