Class: ArchivesspaceChecker

Inherits:
Sinatra::Base
  • Object
show all
Defined in:
archivesspace_checker.rb

Overview

ArchivesspaceChecker is a Sinatra app

Defined Under Namespace

Classes: RuleKeyStr

Constant Summary

CONFIG =

site-specific configuration

# Site-specific settings come from config/config.yml when that file is present;
# otherwise the configuration is an empty Hash.
# File.exist? is used because File.exists? was deprecated and then removed in Ruby 3.2.
if File.exist?(File.join('config', 'config.yml'))
  # safe_load yields a falsy value for an empty document, hence the `|| {}` fallback.
  YAML.safe_load(File.read(File.join('config', 'config.yml'))) || {}
else
  {}
end
PHASE_OPTS =

Schematron phases supported by included schematron

[
  # Each entry carries a display :name and a :value. The values include
  # embedded single quotes (e.g. "'manual'"); check_file compares the
  # submitted phase against the quoted "'#ALL'" form to pick the default checker.
  # NOTE(review): :checked presumably marks the form's default selection — confirm in the view.
  {name: "Manual", value: "'manual'", checked: "checked"},
  {name: "Automatic", value: "'automated'"},
  {name: "Everything", value: "'#ALL'"}
]
OUTPUT_OPTS =

Output options

{
  # Keyed by the :filetype segment of POST /result.:filetype; that route reads
  # :mime from the matching entry to build the Content-Type header.
  # NOTE(review): :checked presumably marks the form's default selection — confirm in the view.
  'xml' => {name: 'xml', value: 'xml', mime: 'application/xml', :checked => "checked"},
  'csv' => {name: 'csv', value: 'csv', mime: 'text/csv'}
}
SCHEMATRON =

The schematron used by the application to check XML

# Reads the schematron source: the path from CONFIG['schematron'] when set,
# otherwise the bundled schematron/archivesspace_checker_sch.xml.
# File.read is used instead of IO.read because IO.read treats a path that
# starts with "|" as a command to run, while File.read only ever opens a file.
File.read(CONFIG['schematron'] ||
          File.join('schematron', 'archivesspace_checker_sch.xml'))
CHECKER =

Default Schematronium instance used for checking files

# Built without a phase argument; check_file reuses this instance when the
# requested phase is "'#ALL'" and constructs a phase-specific checker otherwise.
Schematronium.new(SCHEMATRON)
STRON_REP =

Representation of Schematronium structure used for generating help

# Maps each schematron rule (keyed by the text of its XML comment) to the
# stripped text of its assert elements, then orders the entries by key.
stron_xml.xpath('//rule').each_with_object({}) do |rule, rules_by_key|
  label = rule.xpath('./comment()').text.strip
  key = RuleKeyStr.new(label)
  # A rule is flagged manual when its enclosing pattern's id ends in "-manual".
  pattern_id = rule.ancestors('pattern').first['id']
  key.manual = pattern_id.match(/-manual\Z/)
  rules_by_key[key] = rule.xpath('./assert').map(&:text).map(&:strip)
end.sort_by {|key, _asserts| key}.to_h

Helper Methods (collapse)

Routes (collapse)

Instance Method Details

- (Object) check_file(f, phase)

Runs schematron over a particular file

Uses the default checker when phase is "'#ALL'"; for any other phase, constructs a checker restricted to that phase.

Parameters:

  • f (File)

    a file to check

  • phase (String)

    schematron phase to be run



100
101
102
103
104
105
106
107
108
109
110
111
112
# File 'archivesspace_checker.rb', line 100

# Runs schematron over a file and returns the resulting failed-assert and
# successful-report elements, each annotated with a "line-number" attribute.
#
# @param f [File] a file to check
# @param phase [String] schematron phase to be run; "'#ALL'" selects the
#   shared default checker, anything else builds a checker for that phase
# @return the combined set of failed-assert and successful-report elements
def check_file(f, phase)
  checker = if phase == "'#ALL'"
              CHECKER
            else
              Schematronium.new(SCHEMATRON, phase)
            end

  source_doc = Saxon.XML(f)
  report = checker.check(source_doc.to_s)
  report.remove_namespaces!

  findings = report.xpath("//failed-assert") + report.xpath("//successful-report")
  findings.each do |finding|
    # Resolve the finding's location XPath against the parsed input to
    # recover the line it points at.
    located = source_doc.xpath(finding.attr("location"))
    finding["line-number"] = located.get_line_number
  end
  findings
end

- (nil) csv_output(xml, orig_name, out)

Writes the schematron results to out as CSV

Parameters:

  • xml (Nokogiri::XML::NodeSet)

    results from schematron processing

  • orig_name (String)

    name of EAD as uploaded

  • out (IO)

    stream to write output to

Returns:

  • (nil)


145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
# File 'archivesspace_checker.rb', line 145

# Writes the schematron results to +out+ as CSV.
#
# Layout: a summary header row, a row holding the file name and result count,
# an empty row, a detail header row, then one row per result element.
#
# @param xml [Nokogiri::XML::NodeSet] results from schematron processing
# @param orig_name [String] name of EAD as uploaded
# @param out [IO] stream to write output to
# @return [nil]
def csv_output(xml, orig_name, out)
  # Splatted as keywords below: CSV.generate_line takes its options as keyword
  # arguments, and Ruby 3 no longer converts a trailing positional Hash.
  opts = {encoding: 'utf-8'}

  # Summary section; the header row is written exactly once.
  out << CSV.generate_line(%w|filename total_errors|, **opts)
  out << CSV.generate_line([orig_name, xml.count], **opts)
  out << CSV.generate_line([], **opts)
  out << CSV.generate_line(%w|type location line-number message|, **opts)

  xml.each do |el|
    out << CSV.generate_line([el.name,
                              el['location'],
                              el['line-number'],
                              el.xpath('.//text').first.content], **opts)
  end
  nil
end

- (Object) GET /

Index route, entry point. This is the tool's UI



168
169
170
# File 'archivesspace_checker.rb', line 168

# Entry point: renders the :index Haml template.
get("/") { haml :index }

- (Object) GET /possible-errors

Help page which lists errors that the tool can check for



215
216
217
# File 'archivesspace_checker.rb', line 215

# Help page: renders the :possible_errors Haml template.
get("/possible-errors") { haml :possible_errors }

- (Object) GET /schematron.xml

The schematron file



220
221
222
223
# File 'archivesspace_checker.rb', line 220

# Serves the raw schematron source (the SCHEMATRON constant) as XML.
get("/schematron.xml") do
  xml_headers = {"Content-Type" => "application/xml; charset=utf8"}
  headers(xml_headers)
  SCHEMATRON
end

- (Object) POST /result.:filetype

Form submissions post to this route; the response is information on errors in XML or CSV.

Output is streamed, due to issues with using Nokogiri to build large XML response sets.



179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# File 'archivesspace_checker.rb', line 179

# Form submissions post here; the response reports the check results as XML
# or CSV, chosen by the :filetype segment of the path.
#
# Responds 404 when :filetype has no entry in OUTPUT_OPTS and 400 when the
# 'eadFile' field carries no upload, instead of failing on a nil lookup.
# If Saxon rejects the input, an XML <fatal-error> document is returned.
post "/result.:filetype" do
  output_opt = OUTPUT_OPTS[params[:filetype]]
  halt 404, "Unsupported output type\n" unless output_opt

  up = params['eadFile']
  halt 400, "No EAD file was uploaded\n" unless up.respond_to?(:key?) && up[:tempfile]

  # If Saxon throws, set headers and just return the response
  begin
    result_of_check = check_file(up[:tempfile], params[:phase])
  rescue Java::NetSfSaxonS9api::SaxonApiException => e
    headers "Content-Type" => "#{OUTPUT_OPTS['xml'][:mime]}; charset=utf8"
    # Keep the last three ';'-separated parts of the message, and escape them:
    # the message is placed inside an XML element, so any '<' or '&' in it
    # would otherwise make the error response itself malformed.
    detail = e.message.split(/;/).map(&:strip).last(3).join("\n").encode(xml: :text)
    return <<-ERROR.lines.map(&:lstrip).join
      <?xml version="1.0" encoding="UTF-8"?>
      <fatal-error>
        Possible causes include parse error, DOCTYPE declaration, or entity expansion in the EAD file you're checking. DOCTYPE declarations and entity resolution are disallowed for security reasons.

        Original error message:

        #{ detail }
      </fatal-error>
    ERROR
  end
  # Stream because otherwise large XML output will blow up the heap
  headers "Content-Type" => "#{output_opt[:mime]}; charset=utf8"
  stream do |out|
    case params[:filetype]
    when 'xml'
      xml_output(result_of_check,
                 up[:filename],
                 out)
    when 'csv'
      csv_output(result_of_check,
                 up[:filename],
                 out)
    end
  end
end

- (nil) xml_output(xml, orig_name, out)

Stream XML as generated to out

Parameters:

  • xml (Nokogiri::XML::NodeSet)

    results from schematron processing

  • orig_name (String)

    name of EAD as uploaded

  • out (IO)

    stream to write output to

Returns:

  • (nil)


120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
# File 'archivesspace_checker.rb', line 120

def xml_output(xml, orig_name, out)
  counts = xml.group_by {|el| el.element_children.first.text.strip.gsub(/\s+/, ' ')}.map {|k,v| [k,v.count]}.to_h

  out << "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>\n"
  out << "<file file_name='#{orig_name}' total_errors='#{xml.count}'>\n"
  out << "<error_counts>\n"
  counts.each do |k,v|
    out << "<message count='#{v}'>#{k}</message>\n"
  end
  out << "</error_counts>\n"
  out << "<errors>\n"
  xml.each do |n|
    out << n.to_xml
  end
  out << "</errors>\n"
  out << "</file>"

  nil # Return value is not for use
end