Class: RubyMaat::Dataset

Inherits:
Object
  • Object
show all
Defined in:
lib/ruby_maat/dataset.rb

Overview

Wrapper around a Rover DataFrame that provides domain-specific operations. This replaces the Incanter datasets used in the Clojure version.

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(change_records = []) ⇒ Dataset

Returns a new instance of Dataset.



9
10
11
# File 'lib/ruby_maat/dataset.rb', line 9

# Builds the backing data for this dataset from the given change records.
#
# @param change_records [Array] records handed to build_dataframe;
#   defaults to an empty list
def initialize(change_records = [])
  # build_dataframe is defined outside this listing; presumably it returns
  # the Rover DataFrame mentioned in the class overview — confirm there.
  @data = build_dataframe(change_records)
end

Class Method Details

.from_changes(change_records) ⇒ Object



13
14
15
# File 'lib/ruby_maat/dataset.rb', line 13

# Convenience constructor: forwards change_records to new unchanged.
#
# @param change_records [Object] collection of change records (the filter
#   methods of this class pass an Array of ChangeRecord)
# @return [Object] whatever new returns for the receiving class
def self.from_changes(change_records)
  new(change_records)
end

Instance Method Details

#authors ⇒ Object

Get all authors



49
50
51
52
53
# File 'lib/ruby_maat/dataset.rb', line 49

# Lists the authors present in the dataset, without repeats.
#
# @return [Object] an empty Array when @data is empty, otherwise whatever
#   uniq returns for the :author column
def authors
  @data.empty? ? [] : @data[:author].uniq
end

#coupling_pairs ⇒ Object

Get coupling pairs (combinations of entities that changed together)



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'lib/ruby_maat/dataset.rb', line 92

# Lists every pair of entities that were touched by the same revision.
#
# Rows are bucketed by their "revision" value; within a bucket each entity is
# considered once, and every 2-combination of those entities becomes a pair.
# A pair is emitted once per revision in which it occurs, so the same pair
# may appear several times in the result.
#
# @return [Array<Array>] two-element [entity1, entity2] arrays
def coupling_pairs
  @data.to_a
       .group_by { |row| row["revision"] }
       .flat_map do |_revision, rows|
         rows.map { |row| row["entity"] }.uniq.combination(2).to_a
       end
end

#empty? ⇒ Boolean

Returns:

  • (Boolean)


173
174
175
# File 'lib/ruby_maat/dataset.rb', line 173

# Reports whether the dataset holds no data; delegates to @data.empty?.
#
# @return [Boolean]
def empty?
  @data.empty?
end

#entities ⇒ Object

Get all entities (files)



42
43
44
45
46
# File 'lib/ruby_maat/dataset.rb', line 42

# Lists the entities (files) present in the dataset, without repeats.
#
# @return [Object] an empty Array when @data is empty, otherwise whatever
#   uniq returns for the :entity column
def entities
  if @data.empty?
    []
  else
    @data[:entity].uniq
  end
end

#filter_date_range(start_date, end_date) ⇒ Object

Filter by date range



145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# File 'lib/ruby_maat/dataset.rb', line 145

# Builds a new dataset from the rows whose :date lies between the two bounds.
#
# @param start_date [Object] lower bound handed to between?
# @param end_date [Object] upper bound handed to between?
# @return [Object] the result of Dataset.from_changes for the matching rows
def filter_date_range(start_date, end_date)
  columns = %i[entity author date revision message loc_added loc_deleted]
  kept = []

  @data.each_row do |row|
    if row[:date].between?(start_date, end_date)
      # Copy each column of the row into a fresh ChangeRecord.
      attributes = columns.to_h { |column| [column, row[column]] }
      kept << ChangeRecord.new(**attributes)
    end
  end

  Dataset.from_changes(kept)
end

#filter_min_revisions(min_revs) ⇒ Object

Filter by minimum revisions



56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# File 'lib/ruby_maat/dataset.rb', line 56

# Builds a new dataset containing only the rows of entities that appear in at
# least min_revs distinct revisions.
#
# NOTE(review): rows are read with String keys here, whereas
# filter_date_range reads Symbol keys from each_row — confirm both match the
# column keys of the underlying frame.
#
# @param min_revs [Integer] minimum number of distinct revisions an entity
#   needs in order to be kept
# @return [Object] the result of Dataset.from_changes for the surviving rows
def filter_min_revisions(min_revs)
  # Materialise the rows once; they are scanned twice below.
  rows = @data.to_a

  # Distinct revisions seen per entity (a Set ignores repeated revisions).
  revisions_by_entity = Hash.new { |index, entity| index[entity] = Set.new }
  rows.each { |row| revisions_by_entity[row["entity"]] << row["revision"] }

  # A hash lookup per row keeps the filter linear in the number of rows.
  surviving_rows = rows.select do |row|
    revisions_by_entity[row["entity"]].size >= min_revs
  end

  change_records = surviving_rows.map do |row|
    ChangeRecord.new(
      entity: row["entity"],
      author: row["author"],
      date: row["date"],
      revision: row["revision"],
      message: row["message"],
      loc_added: row["loc_added"],
      loc_deleted: row["loc_deleted"]
    )
  end

  Dataset.from_changes(change_records)
end

#group_by_author_sum_churn ⇒ Object

Group by author and sum churn metrics



32
33
34
# File 'lib/ruby_maat/dataset.rb', line 32

# Groups the data by :author and sums the two churn columns
# (:loc_added and :loc_deleted) within each group.
#
# @return [Object] whatever the grouped sum call returns
def group_by_author_sum_churn
  churn_columns = %i[loc_added loc_deleted]
  per_author = @data.group(:author)
  per_author.sum(churn_columns)
end

#group_by_entity_count_authors ⇒ Object

Group by entity and count distinct authors



22
23
24
# File 'lib/ruby_maat/dataset.rb', line 22

# Groups the data by :entity and counts the :author column per group,
# passing name: "n_authors" to the count call (presumably the name of the
# resulting column — confirm against the data-frame library).
#
# NOTE(review): whether this count is distinct depends on the grouped count
# implementation — confirm.
#
# @return [Object] whatever the grouped count call returns
def group_by_entity_count_authors
  per_entity = @data.group(:entity)
  per_entity.count(:author, name: "n_authors")
end

#group_by_entity_count_revisions ⇒ Object

Group by entity and count revisions



27
28
29
# File 'lib/ruby_maat/dataset.rb', line 27

# Groups the data by :entity and counts the :revision column per group,
# passing name: "n_revs" to the count call (presumably the name of the
# resulting column — confirm against the data-frame library).
#
# @return [Object] whatever the grouped count call returns
def group_by_entity_count_revisions
  per_entity = @data.group(:entity)
  per_entity.count(:revision, name: "n_revs")
end

#group_by_entity_sum_churn ⇒ Object

Group by entity and sum churn metrics



37
38
39
# File 'lib/ruby_maat/dataset.rb', line 37

# Groups the data by :entity and sums the two churn columns
# (:loc_added and :loc_deleted) within each group.
#
# @return [Object] whatever the grouped sum call returns
def group_by_entity_sum_churn
  churn_columns = %i[loc_added loc_deleted]
  per_entity = @data.group(:entity)
  per_entity.sum(churn_columns)
end

#latest_date_by_entity ⇒ Object

Get latest date for each entity (for age analysis)



165
166
167
# File 'lib/ruby_maat/dataset.rb', line 165

# Groups the data by :entity and takes the maximum of the :date column in
# each group (used for age analysis).
#
# @return [Object] whatever the grouped max call returns
def latest_date_by_entity
  per_entity = @data.group(:entity)
  per_entity.max(:date)
end

#revision_count(entity) ⇒ Object

Get revision count for an entity



131
132
133
134
135
136
137
# File 'lib/ruby_maat/dataset.rb', line 131

# Counts the distinct revisions in which the given entity appears.
#
# @param entity [Object] value compared against each row's "entity"
# @return [Integer] number of distinct "revision" values among matching rows
def revision_count(entity)
  @data.to_a
       .select { |row| row["entity"] == entity }
       .map { |row| row["revision"] }
       .uniq
       .size
end

#shared_revisions_count(entity1, entity2) ⇒ Object

Count shared revisions between entity pairs



115
116
117
118
119
120
121
122
123
124
125
126
127
128
# File 'lib/ruby_maat/dataset.rb', line 115

# Counts the revisions in which both given entities appear.
#
# @param entity1 [Object] first entity, compared against each row's "entity"
# @param entity2 [Object] second entity, compared the same way
# @return [Integer] size of the intersection of the two entities' revisions
def shared_revisions_count(entity1, entity2)
  entity1_revs = Set.new
  entity2_revs = Set.new

  @data.to_a.each do |row|
    revision = row["revision"]
    # Two independent checks rather than if/elsif: when both arguments name
    # the same entity, each set must still receive the row's revision,
    # otherwise the intersection would always be empty.
    entity1_revs << revision if row["entity"] == entity1
    entity2_revs << revision if row["entity"] == entity2
  end

  (entity1_revs & entity2_revs).size
end

#size ⇒ Object



169
170
171
# File 'lib/ruby_maat/dataset.rb', line 169

# Size of the dataset; delegates to @data.count.
#
# @return [Object] whatever @data.count returns (presumably the number of
#   rows — confirm against the data-frame library)
def size
  @data.count
end

#to_df ⇒ Object



17
18
19
# File 'lib/ruby_maat/dataset.rb', line 17

# Exposes the underlying data object built in the constructor.
# The same object is returned, not a copy.
#
# @return [Object] the value stored in @data
def to_df
  @data
end

#unique_dates ⇒ Object

Get unique dates



140
141
142
# File 'lib/ruby_maat/dataset.rb', line 140

# Lists the distinct dates present in the dataset, sorted ascending.
#
# @return [Object] an empty Array when @data is empty, otherwise whatever
#   uniq.sort returns for the :date column
def unique_dates
  # Same guard as the other column readers (authors, entities): presumably an
  # empty frame has no :date column to call uniq on — confirm.
  return [] if @data.empty?

  @data[:date].uniq.sort
end