scientist-softserv / ams

Archival Management System to support the American Archive of Public Broadcasting
GNU General Public License v3.0

Add script to copy XML files from S3 to repo #136

Open bkiahstroud opened 3 months ago

bkiahstroud commented 3 months ago

Story

Before the Bulkrax imports began, Rob ran a script to copy the XML files containing records from S3 onto the server. This script should be cleaned up, have any secrets removed, and be added to the repo so that additional files can be copied from S3 in the future.

The scripts can be found in this private SoftServ Slack channel.
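
For reference, here is a minimal sketch of what a cleaned-up, secrets-free version could look like, assuming credentials come from the standard AWS environment variables (or an instance profile) and the bucket, prefix, and destination directory are configurable; `S3_PREFIX` and `DOWNLOAD_DIR` are hypothetical names, not anything the original script used:

```ruby
# Hypothetical cleaned-up script: no credentials in the code. The AWS SDK
# picks up AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY (or an instance profile)
# from the environment on its own.
require 'aws-sdk-s3'
require 'fileutils'

bucket_name = ENV.fetch('AWS_S3_BUCKET', 'ams.americanarchive.org')
prefix      = ENV.fetch('S3_PREFIX', 'ams2')           # hypothetical name
dest_dir    = ENV.fetch('DOWNLOAD_DIR', 'tmp/imports') # hypothetical name

FileUtils.mkdir_p(dest_dir)
bucket = Aws::S3::Resource.new(region: 'us-east-1').bucket(bucket_name)

# Copy every XML object under the prefix into the destination directory.
bucket.objects(prefix: prefix).each do |summary|
  next unless summary.key.end_with?('.xml')
  bucket.object(summary.key).download_file(File.join(dest_dir, File.basename(summary.key)))
end
```

Listing and downloading in one pass skips the intermediate key list; the batched, log-driven scripts below are better suited to resuming a very large copy partway through.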

aprilrieger commented 2 weeks ago

Scripts used to copy XML files from S3 onto the server:

```ruby
require 'aws-sdk-s3'
require 'fileutils'

# Secrets were removed before posting; fill in real credentials before running.
ENV['AWS_S3_BUCKET'] = "ams.americanarchive.org"
ENV['AWS_ACCESS_KEY_ID'] = "" # TODO
ENV['AWS_SECRET_ACCESS_KEY'] = "" # TODO
bucket ||= Aws::S3::Resource.new(region: 'us-east-1').bucket(ENV.fetch('AWS_S3_BUCKET'))

# How many records to process per batch, and which line of full_list.log
# this batch starts from (carried forward between runs).
batch_size = 710000
batch_placeholder = 2509077

1.times do
  start_at = batch_placeholder
  end_at   = batch_placeholder + batch_size
  total = %x{wc -l tmp/imports/full_list.log}.split.first
  batch_name = "AMS1Importer_#{start_at}-#{end_at}"
  file_path = "tmp/imports/#{batch_name}"
  FileUtils.mkdir_p(file_path)
  File.open("tmp/imports/full_list.log") do |f|
    f.each.with_index do |row, i|
      break if i > end_at
      next if i < (start_at - 1)
      puts row
      row.strip!
      obj = bucket.object(row)
      # Download the object into this batch's directory
      obj.download_file(File.join(file_path, File.basename(row)))

      puts "#{i} of #{total}"
    end
  end
  # Create a Bulkrax importer pointed at this batch's directory
  b = Bulkrax::Importer.create(
    name: batch_name,
    admin_set_id: "admin_set/default",
    user_id: 1,
    frequency: "PT0S",
    parser_klass: "PbcoreXmlParser",
    parser_fields:
    {
      "record_element" => "pbcoreDescriptionDocument",
      "import_type" => "single",
      "visibility" => "restricted",
      "rights_statement" => "",
      "override_rights_statement" => "0",
      "file_style" => "Specify a Path on the Server",
      "import_file_path"=> file_path,
      "replace_files"=>true
     }
  )
  batch_placeholder = end_at
end
```

This second script generates tmp/imports/full_list.log, the list of S3 keys that the download script above reads:

```ruby
require 'aws-sdk-s3'
require 'logger'

ENV['AWS_S3_BUCKET'] = "ams.americanarchive.org"
ENV['AWS_ACCESS_KEY_ID'] = "" # TODO
ENV['AWS_SECRET_ACCESS_KEY'] = "" # TODO
bucket ||= Aws::S3::Resource.new(region: 'us-east-1').bucket(ENV.fetch('AWS_S3_BUCKET'))

# Log one S3 object key per line, with no extra formatting.
logger = Logger.new('tmp/imports/full_list.log')
logger.formatter = proc { |severity, datetime, progname, msg|
  "#{msg}\n"
}

i = 0
# Record every key under the "ams2" prefix and print a running count.
bucket.objects(prefix: "ams2").each do |obj|
  logger.info(obj.key)
  puts "#{i += 1}"
end
```

Clean up file names that contain trailing line endings:

```ruby
require 'fileutils'

# In each batch directory, rename files whose names picked up a trailing newline.
Dir.glob('AMS1Importer*').each do |dir|
  Dir.chdir(dir)
  Dir.glob("*xml\n").each do |f|
    FileUtils.mv(f, f.strip)
    puts f
  end
  Dir.chdir('..')
end
```

aprilrieger commented 1 week ago

Waiting for Drew to be back in the office (currently on PTO) to finalize the review/merge.