Open bkiahstroud opened 3 months ago
Scripts used to copy XML files from S3 onto the server:
# Downloads one batch of the objects listed in tmp/imports/full_list.log from
# S3 into tmp/imports/<batch_name>/, then creates a Bulkrax importer that
# points at the downloaded directory.
#
# Expects to run where Bulkrax is already loaded (e.g. a Rails console/runner).
require 'aws-sdk-s3'
require 'fileutils'

ENV['AWS_S3_BUCKET'] = "ams.americanarchive.org"
# Credentials are deliberately NOT assigned here. Export AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY in the environment before running; the AWS SDK picks
# them up on its own. (The previous `ENV[...] = #TODO` lines left the
# assignments without a right-hand side, so they chained into the `bucket`
# assignment below and raised a TypeError.)
bucket ||= Aws::S3::Resource.new(region: 'us-east-1').bucket(ENV.fetch('AWS_S3_BUCKET'))

list_path = "tmp/imports/full_list.log"
batch_size = 710_000
batch_placeholder = 2_509_077

# Number of lines in the list, used only for progress output. Counted in Ruby
# so the value is a plain integer rather than raw `wc -l` output (which also
# contains the file name and a trailing newline).
total = File.foreach(list_path).count

1.times do
  start_at = batch_placeholder
  end_at = batch_placeholder + batch_size
  batch_name = "AMS1Importer_#{start_at}-#{end_at}"
  file_path = "tmp/imports/#{batch_name}"
  FileUtils.mkdir_p(file_path)

  File.open(list_path) do |f|
    f.each_line.with_index do |row, i|
      # NOTE(review): these bounds cover indexes (start_at - 1)..end_at, so
      # consecutive batches overlap by two rows. Kept as-is because re-downloading
      # a file is harmless, whereas tightening the range could skip one.
      break if i > end_at
      next if i < (start_at - 1)

      key = row.strip
      # A blank line is not a valid object key; skip it instead of asking S3 for "".
      next if key.empty?

      puts key
      # Download to directory
      bucket.object(key).download_file(File.join(file_path, File.basename(key)))
      puts "#{i} of #{total}"
    end
  end

  Bulkrax::Importer.create(
    name: batch_name,
    admin_set_id: "admin_set/default",
    user_id: 1,
    frequency: "PT0S",
    parser_klass: "PbcoreXmlParser",
    parser_fields: {
      "record_element" => "pbcoreDescriptionDocument",
      "import_type" => "single",
      "visibility" => "restricted",
      "rights_statement" => "",
      "override_rights_statement" => "0",
      "file_style" => "Specify a Path on the Server",
      "import_file_path" => file_path,
      "replace_files" => true
    }
  )

  # The next iteration (if the loop count is raised) continues where this one ended.
  batch_placeholder = end_at
end
ruby
# Writes the key of every object under the "ams2" prefix of the bucket to
# tmp/imports/full_list.log, one key per line, printing a running count.
require 'aws-sdk-s3'
require 'fileutils'

ENV['AWS_S3_BUCKET'] = "ams.americanarchive.org"
# Credentials are deliberately NOT assigned here. Export AWS_ACCESS_KEY_ID and
# AWS_SECRET_ACCESS_KEY in the environment before running; assigning the
# placeholder string "#TODO" would overwrite any real credentials already set.
bucket ||= Aws::S3::Resource.new(region: 'us-east-1').bucket(ENV.fetch('AWS_S3_BUCKET'))

list_path = 'tmp/imports/full_list.log'
FileUtils.mkdir_p(File.dirname(list_path))

count = 0
# A plain file is used instead of Logger: Logger prepends a
# "# Logfile created on ..." header to new files and appends to existing ones,
# either of which would put lines that are not object keys (or duplicate keys)
# into the list. Opening with 'w' starts a fresh listing on every run.
File.open(list_path, 'w') do |list|
  bucket.objects(prefix: "ams2").each do |obj|
    list.write("#{obj.key}\n")
    puts(count += 1)
  end
end
ruby
require 'fileutils'

# Renames files whose names end in "xml" followed by a newline (left behind
# when a key was used without stripping its line terminator) to the same name
# with surrounding whitespace removed.
#
# Only directories matching AMS1Importer* directly under +root+ are visited.
# Paths are built explicitly rather than via Dir.chdir, so the process working
# directory is never changed, even if a rename raises part-way through.
#
# @param root [String] directory containing the AMS1Importer* batch directories
# @return [void]
def strip_xml_filenames(root = Dir.pwd)
  Dir.glob('AMS1Importer*', base: root).each do |batch|
    batch_dir = File.join(root, batch)
    # The glob can also match regular files; those have nothing to clean up.
    next unless File.directory?(batch_dir)

    Dir.glob("*xml\n", base: batch_dir).each do |f|
      FileUtils.mv(File.join(batch_dir, f), File.join(batch_dir, f.strip))
      puts f
    end
  end
end

strip_xml_filenames
Waiting for Drew, who is currently on PTO, to return to the office and finalize the review/merge.
Story
Before Bulkrax imports began, Rob ran a script to copy the XML files containing records from S3 onto the server. This script should be cleaned up, have any secrets removed, and be added to the repo. This will enable copying of additional files from S3 in the future.
The scripts can be found in this private SoftServ Slack channel.