Class: ExtractMetadata

Inherits:
Object
  • Object
show all
Defined in:
lib/extractmetadata.rb

Instance Method Summary collapse

Constructor Details

#initialize(file, input_dir, output_dir) ⇒ ExtractMetadata

Returns a new instance of ExtractMetadata.



5
6
7
8
9
10
11
12
13
# File 'lib/extractmetadata.rb', line 5

# Set up an extractor for a single file.
#
# file       - stored as @path; the file whose metadata will be extracted
# input_dir  - stored as @input_dir; removed from @path to build the relative path
# output_dir - stored as @output_dir
def initialize(file, input_dir, output_dir)
  @path, @input_dir, @output_dir = file, input_dir, output_dir

  # Extensions (without the leading dot) that #extract will run metadata
  # extraction for; any other file type is skipped.
  @allowed_extensions = %w[
    pdf doc docbook docx txt rtf md csv xls xlsx
    jpg jpeg png gif svg
  ]
end

Instance Method Details

#extract ⇒ Object

Extract metadata



16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# File 'lib/extractmetadata.rb', line 16

# Build the metadata hash for the file at @path.
#
# The result always contains :rel_path, :folders, :formatted_name and
# :filetype. When the file type is listed in @allowed_extensions, the hash
# returned by #extract_file_metadata is merged in as well; otherwise a
# "skipping" notice is printed to stdout.
#
# Extraction is best-effort: if #extract_file_metadata raises, the error is
# reported on stderr and the path-derived entries are still returned.
#
# Side effect: sets @rel_path, which the get_* helpers read.
def extract
  @rel_path = get_rel_path

  outhash = {
    rel_path: @rel_path,
    folders: get_folders,
    formatted_name: get_formatted_name,
    filetype: get_file_type
  }

  if @allowed_extensions.include?(outhash[:filetype])
    begin
      outhash.merge!(extract_file_metadata)
    rescue StandardError => e
      # Keep going with the path-derived data, but do not hide the failure.
      warn "could not extract metadata from #{@path}: #{e.message}"
    end
  else
    puts "skipping .#{outhash[:filetype]} file"
  end

  outhash
end

#extract_file_metadata ⇒ Object

Extract PDF metadata



67
68
69
70
71
72
73
74
75
76
77
78
# File 'lib/extractmetadata.rb', line 67

# Collect the document properties that Docsplit reports for the file at @path.
#
# Returns a Hash with the keys :author, :creator, :producer, :title,
# :subject, :date, :keywords and :length, each holding the result of the
# matching Docsplit.extract_* call.
def extract_file_metadata
  {
    author: Docsplit.extract_author(@path),
    creator: Docsplit.extract_creator(@path),
    producer: Docsplit.extract_producer(@path),
    title: Docsplit.extract_title(@path),
    subject: Docsplit.extract_subject(@path),
    date: Docsplit.extract_date(@path),
    keywords: Docsplit.extract_keywords(@path),
    length: Docsplit.extract_length(@path)
  }
end

#get_file_type ⇒ Object

Get file type



62
63
64
# File 'lib/extractmetadata.rb', line 62

# Return the extension of @rel_path without the leading dot, e.g. "pdf".
#
# Only the final path component is considered, so a dot in a directory name
# is not mistaken for an extension. A file without an extension (including a
# dotfile such as ".gitignore") yields an empty string.
def get_file_type
  File.extname(@rel_path).sub(/\A\./, "")
end

#get_folders ⇒ Object

Split relative path and get array of directories



41
42
43
44
45
46
47
48
49
# File 'lib/extractmetadata.rb', line 41

# Split @rel_path on "/" and return the directory names as an Array,
# without the file name and without empty segments.
def get_folders
  segments = @rel_path.split("/")

  # Drop only the final segment (the file name). Deleting by value would
  # also remove any folder that happens to share the file's name.
  segments.pop

  # A leading or doubled "/" produces empty segments; discard them.
  segments.reject(&:empty?)
end

#get_formatted_name ⇒ Object

Get a formatted file name



57
58
59
# File 'lib/extractmetadata.rb', line 57

# Build a display name from @rel_path: take everything before the first ".",
# turn underscores into spaces and remove every "/".
def get_formatted_name
  stem = @rel_path.split(".").first
  stem.tr("_", " ").delete("/")
end

#get_rel_path ⇒ Object

Get the relative path



52
53
54
# File 'lib/extractmetadata.rb', line 52

# Return @path with the first occurrence of @input_dir removed.
#
# Only the first occurrence is removed so that a deeper folder whose name
# happens to match @input_dir stays in the relative path.
def get_rel_path
  @path.sub(@input_dir, "")
end