Closed aneesha09 closed 5 years ago
<!--
Host page for the document-keyword demo. The body is intentionally empty:
the drop zone (#drop-target) is created at runtime by the page script.
-->
<html>
<head>
<!-- In-browser Babel; it processes the script below that is marked type="text/babel". -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/babel-standalone/6.26.0/babel.js"></script>
<!-- d3 v5: used by the page script for DOM creation and event handling. -->
<script src="https://d3js.org/d3.v5.min.js"></script>
<!-- pdf.js: provides the pdfjsLib global used to extract text from PDF files. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.2.2/pdf.js"></script>
<!-- mammoth: provides the mammoth global used to extract raw text from Word documents. -->
<script src="https://cdnjs.cloudflare.com/ajax/libs/mammoth/1.4.7/mammoth.browser.min.js"></script>
<!-- NOTE(review): presumably a browser bundle of the retext keyword module exposing a RETEXT global; confirm against the build. -->
<script src="retext.js"></script>
<!-- Page logic: drag-and-drop handling, text extraction and keyword rendering. -->
<script type="text/babel" src="dockeywords.js">
</script>
<style>
/* Drop zone: stretched over the whole viewport (inset by margin and border). */
#drop-target {
position: absolute;
top: 0px;
left: 0px;
right: 0px;
bottom: 0px;
background: linear-gradient(#dddddd, #eeeeee);
padding: 1em;
line-height: 100%;
vertical-align: middle;
border: 1em solid black;
margin: 1em;
font-family: monospace;
}
/* Applied while a drag is over the drop zone. !important is required because
   the #drop-target ID rule above would otherwise win on specificity. */
.dragging {
border-color: red !important;
}
</style>
</head>
<body>
</body>
</html>
// Drop zone element, appended to <body>; it doubles as the container that
// extraction results are rendered into.
const dragTarget = d3
  .select(document.body)
  .append('div')
  .attr('id', "drop-target")
  .text('Drag and drop file here');

// Visual feedback: toggle the "dragging" class while a drag hovers the zone.
dragTarget.on('dragenter', () => {
  dragTarget.classed('dragging', true);
});
dragTarget.on('dragleave', () => {
  dragTarget.classed('dragging', false);
});

// The default dragover action must be cancelled, otherwise the browser never
// delivers the subsequent drop event to this element.
dragTarget.on('dragover', () => {
  d3.event.preventDefault();
});

// Handle the actual drop.
dragTarget.on('drop', () => {
  console.log('File(s) dropped');
  dragTarget.classed('dragging', false);
  // Prevent default behavior (Prevent file from being opened)
  d3.event.preventDefault();
  // Only the first dropped file is processed; any additional files are ignored.
  const droppedFiles = d3.event.dataTransfer.files;
  if (droppedFiles.length > 0) {
    loadDataFromFile(droppedFiles[0]);
  }
});
/**
 * Extracts the text of a dropped file and renders its keywords.
 *
 * Appends a <div> to the drop target containing the file name (<h2>) and the
 * pretty-printed JSON result of RETEXT.getKeywords (<pre>).
 *
 * @param {File} file - The dropped file; its name decides which extractor is
 *   used (".doc"/".docx" -> readTextFromDoc, ".pdf" -> readTextFromPdf).
 * @returns {Promise<void>} Resolves once the result has been rendered.
 * @throws {Error} If the file extension is not one of the supported types.
 */
async function loadDataFromFile(file) {
  // Classify by extension up front. The dot is part of the pattern so that
  // names merely ending in the letters (e.g. "manual.adoc") are not mistaken
  // for Word or PDF files.
  // NOTE(review): legacy ".doc" is still routed to mammoth as before; confirm
  // that mammoth can actually read that format.
  const isWordDocument = /\.docx?$/i.test(file.name);
  const isPdf = /\.pdf$/i.test(file.name);

  // Fail fast with a descriptive error instead of reading the whole file and
  // then crashing on an undefined extraction result.
  if (!isWordDocument && !isPdf) {
    throw new Error(`Unsupported file type: ${file.name}`);
  }

  const content = await readFile(file);
  const data = isWordDocument
    ? await readTextFromDoc(content)
    : await readTextFromPdf(content);

  const target = dragTarget.append('div');
  target.append('h2').text(file.name);
  target.append('pre').text(JSON.stringify(await RETEXT.getKeywords(data.text), null, 2));
}
/**
 * Reads a file into memory using FileReader.
 *
 * @param {Blob} file - The file (or blob) to read.
 * @returns {Promise<ArrayBuffer>} Resolves with the file contents; rejects
 *   with the reader's error if the read fails.
 */
function readFile(file) {
  return new Promise((resolve, reject) => {
    const fileReader = new FileReader();
    fileReader.onload = () => {
      resolve(fileReader.result);
    };
    // Without an error handler a failed read would leave the promise pending
    // forever, silently stalling the caller.
    fileReader.onerror = () => {
      reject(fileReader.error || new Error('Failed to read file'));
    };
    fileReader.readAsArrayBuffer(file);
  });
}
/**
 * Extracts the text and basic metadata of a PDF using pdf.js.
 *
 * Text items within a page are joined with a newline; pages are joined with a
 * blank line.
 *
 * @param {ArrayBuffer} content - Raw bytes of the PDF file.
 * @returns {Promise<{meta: {Title: *, Subject: *, Author: *}, text: string}>}
 *   The Title/Subject/Author entries of the document info (undefined when
 *   absent) and the concatenated page text.
 */
async function readTextFromPdf(content) {
  // getDocument returns a loading task. Use its documented `promise` property
  // rather than awaiting the task itself, which only works through a
  // deprecated thenable. Fall back to the task for builds that return a
  // promise directly.
  const loadingTask = pdfjsLib.getDocument(new Uint8Array(content));
  const pdf = await (loadingTask.promise || loadingTask);

  const meta = await pdf.getMetadata();
  // Guard against a missing info dictionary so metadata never blocks text extraction.
  const { Title, Subject, Author } = meta.info || {};

  // Pages are 1-indexed in pdf.js and are read in order.
  const pageTexts = [];
  for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
    const page = await pdf.getPage(pageNumber);
    const textContent = await page.getTextContent({
      normalizeWhitespace: true
    });
    pageTexts.push(textContent.items.map(item => item.str).join('\n'));
  }

  return {
    meta: {
      Title,
      Subject,
      Author
    },
    text: pageTexts.join('\n\n')
  };
}
/**
 * Extracts the raw text of a Word document using mammoth.
 *
 * @param {ArrayBuffer} content - Raw bytes of the document.
 * @returns {Promise<{text: string}>} The `value` reported by
 *   mammoth.extractRawText, wrapped under a `text` key.
 */
async function readTextFromDoc(content) {
  const { value } = await mammoth.extractRawText({ arrayBuffer: content });
  return { text: value };
}
var retext = require('retext')
var keywords = require('retext-keywords')
var toString = require('nlcst-to-string')
/**
 * Extracts keywords and key phrases from a text with retext-keywords.
 *
 * @param {string} text - The text to analyse.
 * @param {number} [maximum=10] - Passed to retext-keywords as its `maximum` option.
 * @returns {Promise<{keywords: string[], keyphrases: string[]}>} The first
 *   match of every keyword and key phrase, rendered as plain strings.
 *   Rejects if processing fails.
 */
module.exports.getKeywords = function(text, maximum = 10) {
  return new Promise((resolve, reject) => {
    retext()
      .use(keywords, {
        maximum
      })
      .process(text, (err, file) => {
        // Errors must go through reject: throwing inside this callback would
        // not settle the promise, leaving the caller waiting forever.
        if (err) {
          reject(err);
          return;
        }
        try {
          const keywordList = file.data.keywords.map(keyword =>
            toString(keyword.matches[0].node)
          );
          // Wrap toString so that map's index argument is not passed along
          // as toString's second parameter.
          const keyphraseList = file.data.keyphrases.map(phrase =>
            phrase.matches[0].nodes.map(node => toString(node)).join('')
          );
          resolve({
            keywords: keywordList,
            keyphrases: keyphraseList
          });
        } catch (extractionError) {
          reject(extractionError);
        }
      });
  });
}
PDF loading also demonstrates metadata extraction.
As an Agency Magda user, I want to be able to generate keywords for my text so that I don't have to think of them and enter them myself.
Acceptance Criteria:
Dev Notes:
We already know how to generate keywords; what remains is extracting the text from the file. Find a library that can do this, as we don't want to write a PDF parser ourselves.
Reference to keyword investigation - #2036