Module: RegexpPropertyValues::Updater

Defined in:
lib/regexp_property_values/updater.rb

Constant Summary collapse

# Root URL that the UCD data files are downloaded from by default.
BASE_URL =
'https://www.unicode.org/Public/UCD/latest/ucd'.freeze
# Data files fetched relative to the UCD base path.
UCD_FILES =
%w[
  Blocks.txt
  DerivedAge.txt
  DerivedCoreProperties.txt
  PropertyAliases.txt
  PropertyValueAliases.txt
  PropList.txt
  Scripts.txt
].freeze
# Data files fetched relative to the emoji base path.
EMOJI_FILES =
%w[
  emoji-data.txt
].freeze
# Scratch directory (next to this file) that holds the downloaded files.
TMP_DIR =
File.join(__dir__, 'tmp_ucd').freeze

Class Method Summary collapse

Class Method Details

.call(ucd_path: nil, emoji_path: nil) ⇒ Object



26
27
28
29
30
31
32
33
# File 'lib/regexp_property_values/updater.rb', line 26

# Runs the whole update: creates the scratch directory, downloads the data
# files, writes the values and aliases files, removes the scratch directory
# and prints a summary.
#
# @param ucd_path [String, nil] forwarded to download_ucd_files
# @param emoji_path [String, nil] forwarded to download_ucd_files
# @return [Object] whatever print_stats returns
def call(ucd_path: nil, emoji_path: nil)
  prepare_tmp_dir
  begin
    download_ucd_files(ucd_path: ucd_path, emoji_path: emoji_path)
    write_values
    write_aliases
  ensure
    # Always clean up, even when a download or parse step raises, so a
    # failed run does not leave the scratch directory behind.
    remove_tmp_dir
  end
  print_stats
end

.download_ucd_files(ucd_path: nil, emoji_path: nil) ⇒ Object



40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/regexp_property_values/updater.rb', line 40

# Asks for confirmation on stdin and, if given, downloads every file in
# UCD_FILES and EMOJI_FILES into TMP_DIR using `wget`.
#
# @param ucd_path [String, nil] base location of the UCD files; falls back
#   to ENV['RPV_UCD_PATH'], then BASE_URL
# @param emoji_path [String, nil] base location of the emoji files; falls
#   back to ENV['RPV_EMOJI_PATH'], then "#{BASE_URL}/emoji/"
# @return [nil, Array<String>] nil when the download was skipped
def download_ucd_files(ucd_path: nil, emoji_path: nil)
  puts 'This will try to load the latest UCD data. Continue? [y/n]'
  # gets returns nil at EOF; to_s makes that count as "no".
  return puts 'download skipped.' unless $stdin.gets.to_s.match?(/^y/i)

  ucd_path   ||= ENV['RPV_UCD_PATH']   || BASE_URL
  emoji_path ||= ENV['RPV_EMOJI_PATH'] || "#{BASE_URL}/emoji/"

  fetch = lambda do |base, file|
    # Drop one trailing slash so the joined URL has no "//" before the file.
    url = "#{base.chomp('/')}/#{file}"
    # Argument-list form of system bypasses the shell, so characters in the
    # path are never interpreted as shell syntax.
    warn "Failed to download #{url}" unless system('wget', url)
  end

  Dir.chdir(TMP_DIR) do
    UCD_FILES.each   { |f| fetch.call(ucd_path, f) }
    EMOJI_FILES.each { |f| fetch.call(emoji_path, f) }
  end
end

.in_values?(string) ⇒ Boolean

Returns:

  • (Boolean)


115
116
117
# File 'lib/regexp_property_values/updater.rb', line 115

# Tells whether +string+ equals one of the collected @values, ignoring case.
#
# @param string [String] candidate name
# @return [Boolean]
def in_values?(string)
  @values.each do |candidate|
    return true if candidate.casecmp?(string)
  end
  false
end

.prepare_tmp_dir ⇒ Object



35
36
37
38
# File 'lib/regexp_property_values/updater.rb', line 35

# Ensures TMP_DIR exists and is empty: any existing entry at that path is
# deleted first, then the directory is created anew.
def prepare_tmp_dir
  leftover = File.exist?(TMP_DIR)
  FileUtils.rm_rf(TMP_DIR) if leftover

  FileUtils.mkdir(TMP_DIR)
end


128
129
130
# File 'lib/regexp_property_values/updater.rb', line 128

# Prints how many values and aliases are currently held in @values and
# @aliases, surrounded by blank lines.
def print_stats
  value_count = @values.size
  alias_count = @aliases.size
  print "\nFetched #{value_count} values and #{alias_count} aliases.\n\n"
end

.remove_tmp_dir ⇒ Object



124
125
126
# File 'lib/regexp_property_values/updater.rb', line 124

# Deletes TMP_DIR and everything inside it; does nothing if it is absent.
def remove_tmp_dir
  FileUtils.rm_r(TMP_DIR, force: true)
end

.scan(file, pattern) ⇒ Object



119
120
121
122
# File 'lib/regexp_property_values/updater.rb', line 119

# Reads +file+ from TMP_DIR and yields the MatchData of every occurrence of
# +pattern+ in its contents.
#
# @param file [String] file name relative to TMP_DIR
# @param pattern [Regexp] pattern to search for
# @yieldparam [MatchData] the match for each occurrence
def scan(file, pattern)
  contents = File.read(File.join(TMP_DIR, file))
  contents.scan(pattern) do
    match = Regexp.last_match
    yield match
  end
end

.write_aliases ⇒ Object



92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# File 'lib/regexp_property_values/updater.rb', line 92

# Builds @aliases as a set of [alias, name] pairs and writes them, sorted
# and formatted as "alias;name" lines, to RegexpPropertyValues::ALIASES_PATH.
#
# A pair is kept only when its name is a known value (per in_values?) and
# the alias itself is not. Pairs come from PropertyAliases.txt and from the
# "gc"/"sc" rows of PropertyValueAliases.txt, which may carry a second alias.
def write_aliases
  @aliases = Set.new

  scan('PropertyAliases.txt', /^(?<alias>\w+) *; (?<name>\w+)/) do |caps|
    next unless in_values?(caps[:name])
    next if in_values?(caps[:alias])

    @aliases << [caps[:alias], caps[:name]]
  end

  scan('PropertyValueAliases.txt',
    /^[gs]c ; (?<alias1>\w+) *; (?<name>\w+)(?: *; (?<alias2>\w+))?/) do |caps|
    name = caps[:name]
    next unless in_values?(name)

    # alias2 is optional in the pattern and therefore may be nil.
    [caps[:alias1], caps[:alias2]].each do |candidate|
      next if candidate.nil? || in_values?(candidate)

      @aliases << [candidate, name]
    end
  end

  lines = @aliases.sort.map { |pair| pair.join(';') }
  File.write(RegexpPropertyValues::ALIASES_PATH, lines.join("\n"))
end

.write_values ⇒ Object



53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# File 'lib/regexp_property_values/updater.rb', line 53

# Builds @values as a set of property value names and writes them, sorted
# and newline-separated, to RegexpPropertyValues::VALUES_PATH.
#
# The set starts from fixed lists (posix, special and legacy names) and is
# extended with names scanned from the data files in TMP_DIR: property
# names, "gc" long names, block names (prefixed "In_") and ages (prefixed
# "Age=").
def write_values
  posix_props = %w[
    Alpha Blank Cntrl Digit Graph Lower Print
    Punct Space Upper XDigit Word Alnum ASCII
    XPosixPunct
  ]
  special_props = %w[Any Assigned In_No_Block Unknown]
  legacy_props  = %w[Newline]

  @values = Set.new(posix_props + special_props + legacy_props)

  # Matches "<codepoint or range> ; <property name> # ..." rows.
  prop_row = /^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (?<prop_name>\w+) +# /
  prop_files = %w[
    DerivedCoreProperties.txt
    PropList.txt
    Scripts.txt
    emoji-data.txt
  ]
  prop_files.each do |file|
    scan(file, prop_row) { |caps| @values.add(caps[:prop_name]) }
  end

  scan('PropertyValueAliases.txt', /^gc ; \w+ *; (?<prop_name>\w+)/) do |caps|
    @values.add(caps[:prop_name])
  end

  scan('Blocks.txt', /^[\dA-F.]+ *; (?<block_name>[-\w ]+)/) do |caps|
    # Spaces and hyphens in block names become underscores.
    @values.add("In_#{caps[:block_name].gsub(/\W/, '_')}")
  end

  scan('DerivedAge.txt', /^[\dA-F.]+ *; (?<age_num>[\d.]+)/) do |caps|
    @values.add("Age=#{caps[:age_num]}")
  end

  sorted = @values.sort
  File.write(RegexpPropertyValues::VALUES_PATH, sorted.join("\n"))
end