Module: RegexpPropertyValues::Updater

Defined in:
lib/regexp_property_values/updater.rb

Constant Summary collapse

# Base URL that the per-version download paths are appended to.
# Served over HTTPS so the fetched data files cannot be altered in transit.
BASE_URL = 'https://www.unicode.org/Public/'.freeze

# Data files fetched from the versioned "ucd" directory.
UCD_FILES = %w[
  Blocks.txt
  DerivedAge.txt
  DerivedCoreProperties.txt
  PropertyAliases.txt
  PropertyValueAliases.txt
  PropList.txt
  Scripts.txt
].freeze

# Data files fetched from the versioned "emoji" directory.
EMOJI_FILES = %w[
  emoji-data.txt
].freeze

# Scratch directory (next to this file) that downloads are stored in.
TMP_DIR = File.join(__dir__, 'tmp_ucd').freeze

Class Method Summary collapse

Class Method Details

.callObject



26
27
28
29
30
31
32
33
# File 'lib/regexp_property_values/updater.rb', line 26

# Runs the whole update: creates the temp directory, downloads the data
# files, writes the values and aliases files, removes the temp directory
# and finally prints how many entries were collected.
#
# The temp directory is removed even when a step raises (e.g. files are
# missing because the download was declined), so a failed run does not
# leave downloaded data behind. The error itself still propagates and the
# stats are only printed after a successful run.
def call
  prepare_tmp_dir
  begin
    download_ucd_files
    write_values
    write_aliases
  ensure
    remove_tmp_dir
  end
  print_stats
end

.download_ucd_filesObject



40
41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/regexp_property_values/updater.rb', line 40

# Downloads every file in UCD_FILES and EMOJI_FILES into TMP_DIR using
# wget, for the Unicode / emoji versions reported by the running Ruby.
#
# Asks for confirmation on stdin first; unless the answer starts with
# "y" (case-insensitive) it prints 'download skipped.' and returns nil
# without downloading anything.
#
# @raise [KeyError] if RbConfig has no UNICODE_VERSION / UNICODE_EMOJI_VERSION
# @raise [RuntimeError] if wget cannot be run or exits unsuccessfully
def download_ucd_files
  unicode_version = RbConfig::CONFIG.fetch('UNICODE_VERSION')
  emoji_version   = RbConfig::CONFIG.fetch('UNICODE_EMOJI_VERSION')
  puts 'This will load ucd and emoji data for the CURRENT RUBY '\
       "(#{unicode_version} / #{emoji_version}). Run this on the "\
       'latest Ruby version you want to support. Continue? [y/n]'
  return puts 'download skipped.' unless $stdin.gets =~ /^y/i

  # BASE_URL may or may not end in a slash; normalize so URLs never
  # contain a doubled "//" in the path.
  base = BASE_URL.chomp('/')
  ucd_urls = UCD_FILES.map { |f| "#{base}/#{unicode_version}/ucd/#{f}" }
  # NOTE(review): recent Unicode releases may host emoji-data.txt under
  # "<version>/ucd/emoji/" instead - confirm this path still resolves.
  emoji_urls = EMOJI_FILES.map { |f| "#{base}/emoji/#{emoji_version}/#{f}" }

  Dir.chdir(TMP_DIR) do
    (ucd_urls + emoji_urls).each do |url|
      # Argument-list form bypasses the shell, and the result is checked
      # so a failed download stops here instead of surfacing later as a
      # missing file.
      system('wget', url) || raise("wget failed for #{url}")
    end
  end
end

.in_values?(string) ⇒ Boolean

Returns:

  • (Boolean)


116
117
118
# File 'lib/regexp_property_values/updater.rb', line 116

# Tells whether +string+ matches any entry of @values, ignoring case.
#
# @param string [String] the name to look up
# @return [Boolean]
def in_values?(string)
  @values.each do |known|
    return true if known.casecmp?(string)
  end
  false
end

.prepare_tmp_dirObject



35
36
37
38
# File 'lib/regexp_property_values/updater.rb', line 35

# Makes TMP_DIR a fresh, empty directory, discarding whatever was there.
#
# No File.exist? guard is used: rm_rf already ignores a missing path, and
# such a guard would skip a dangling symlink (File.exist? is false for
# one), which then makes mkdir fail because the path is still taken.
def prepare_tmp_dir
  FileUtils.rm_rf(TMP_DIR)
  FileUtils.mkdir(TMP_DIR)
end


129
130
131
# File 'lib/regexp_property_values/updater.rb', line 129

# Prints a one-line summary with the sizes of @values and @aliases,
# preceded by a blank line and followed by one.
def print_stats
  value_count = @values.size
  alias_count = @aliases.size
  print "\nFetched #{value_count} values and #{alias_count} aliases.\n\n"
end

.remove_tmp_dirObject



125
126
127
# File 'lib/regexp_property_values/updater.rb', line 125

# Deletes TMP_DIR together with everything inside it. A missing
# directory is not an error.
def remove_tmp_dir
  FileUtils.rm_r(TMP_DIR, force: true)
end

.scan(file, pattern) ⇒ Object



120
121
122
123
# File 'lib/regexp_property_values/updater.rb', line 120

# Reads +file+ from TMP_DIR and yields the MatchData of every match of
# +pattern+ in its contents.
#
# @param file [String] file name relative to TMP_DIR
# @param pattern [Regexp] pattern to search for
# @yieldparam match [MatchData] one per match, in order of occurrence
def scan(file, pattern)
  contents = File.read(File.join(TMP_DIR, file))
  contents.scan(pattern) do
    match = Regexp.last_match
    yield match
  end
end

.write_aliasesObject



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# File 'lib/regexp_property_values/updater.rb', line 93

# Collects [alias, name] pairs into @aliases and writes them, sorted, as
# "alias;name" lines to RegexpPropertyValues::ALIASES_PATH.
#
# A pair is only kept when the name is a known value (see #in_values?)
# and the alias is not itself a known value.
def write_aliases
  @aliases = Set.new

  scan('PropertyAliases.txt', /^(?<alias>\w+) *; (?<name>\w+)/) do |caps|
    name = caps[:name]
    short_form = caps[:alias]
    next unless in_values?(name)

    @aliases << [short_form, name] unless in_values?(short_form)
  end

  # Only "gc" and "sc" lines are considered; the second alias is optional.
  scan('PropertyValueAliases.txt',
    /^[gs]c ; (?<alias1>\w+) *; (?<name>\w+)(?: *; (?<alias2>\w+))?/) do |caps|
    name = caps[:name]
    next unless in_values?(name)

    [caps[:alias1], caps[:alias2]].each do |candidate|
      next if candidate.nil? || in_values?(candidate)

      @aliases << [candidate, name]
    end
  end

  lines = @aliases.sort.map { |pair| pair.join(';') }
  File.write(RegexpPropertyValues::ALIASES_PATH, lines.join("\n"))
end

.write_valuesObject



54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# File 'lib/regexp_property_values/updater.rb', line 54

# Collects all property value names into @values and writes them, sorted
# and newline-separated, to RegexpPropertyValues::VALUES_PATH.
#
# The set starts from three hard-coded lists (posix, special, legacy) and
# is extended with whatever the patterns below capture from the data files
# read through #scan.
def write_values
  posix_properties = %w[
    Alpha Blank Cntrl Digit Graph Lower Print
    Punct Space Upper XDigit Word Alnum ASCII
    XPosixPunct
  ]
  special_properties = %w[Any Assigned In_No_Block Unknown]
  legacy_properties  = %w[Newline]

  @values = Set.new(posix_properties + special_properties + legacy_properties)

  # Lines shaped like "<hex>[..<hex>] ; <name> # ...".
  property_line = /^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (?<prop_name>\w+) +# /
  property_files = %w[
    DerivedCoreProperties.txt
    PropList.txt
    Scripts.txt
    emoji-data.txt
  ]
  property_files.each do |file|
    scan(file, property_line) { |caps| @values.add(caps[:prop_name]) }
  end

  # Third field of every "gc" line.
  scan('PropertyValueAliases.txt', /^gc ; \w+ *; (?<prop_name>\w+)/) do |caps|
    @values.add(caps[:prop_name])
  end

  # Block names are prefixed with "In_"; every non-word character in the
  # name becomes an underscore.
  scan('Blocks.txt', /^[\dA-F.]+ *; (?<block_name>[-\w ]+)/) do |caps|
    @values.add("In_#{caps[:block_name].gsub(/\W/, '_')}")
  end

  # Ages are stored as "Age=<number>".
  scan('DerivedAge.txt', /^[\dA-F.]+ *; (?<age_num>[\d.]+)/) do |caps|
    @values.add("Age=#{caps[:age_num]}")
  end

  sorted_values = @values.sort
  File.write(RegexpPropertyValues::VALUES_PATH, sorted_values.join("\n"))
end