magda-io / magda

A federated, open-source data catalog for all your big data and small data
https://magda.io
Apache License 2.0
512 stars 93 forks source link

SPIKE - Generate keywords from PDF & Word #2056

Closed aneesha09 closed 5 years ago

aneesha09 commented 5 years ago

As an Agency Magda User, I want to be able to generate keywords for my text, so that I don't have to think of them and enter them myself.

Acceptance Criteria:

  1. Use a file to generate keywords using the chosen keyword library
  2. Display keywords from the text in a text box and allow user to edit
  3. Integrate keyword tool into application
  4. Upload a file containing text to populate keywords
  5. Support file types PDF and Word

Dev Notes:

We already know how to generate keywords; what remains is extracting the text from the file. Find a library that can do this, as we don't want to write a PDF parser ourselves.

Reference to keyword investigation - #2036

nahidakbar commented 5 years ago

Demo

<!--
  Demo host page for the keyword-extraction spike.
  It only loads the libraries and styles; the body is intentionally empty.
-->
<html>
<head>
  <!-- In-browser Babel: needed for the type="text/babel" script below. -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/babel-standalone/6.26.0/babel.js"></script>
  <!-- d3 v5: DOM selection and event handling. -->
  <script src="https://d3js.org/d3.v5.min.js"></script>
  <!-- pdf.js: text and metadata extraction from PDF files. -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/pdf.js/2.2.2/pdf.js"></script>
  <!-- mammoth: raw text extraction from Word documents. -->
  <script src="https://cdnjs.cloudflare.com/ajax/libs/mammoth/1.4.7/mammoth.browser.min.js"></script>
  <!-- NOTE(review): presumably a browser bundle of the keyword module that exposes a global RETEXT; confirm. -->
  <script src="retext.js"></script>
  <!-- NOTE(review): presumably contains the drag-and-drop and file-loading demo code; confirm. -->
  <script type="text/babel" src="dockeywords.js">

  </script>
  <style>
    /* Full-viewport drop zone. */
    #drop-target {
      position: absolute;
      top: 0px;
      left: 0px;
      right: 0px;
      bottom: 0px;
      background: linear-gradient(#dddddd, #eeeeee);
      padding: 1em;
      line-height: 100%;
      vertical-align: middle;
      border: 1em solid black;
      margin: 1em;
      font-family: monospace;
    }
    /* Highlight state; !important is needed to beat the ID selector's border colour. */
    .dragging {
      border-color: red !important;
    }
  </style>
</head>
<body>
</body>
</html>

Drag and Drop

// Create the drop zone as the only visible element of the demo page.
const dragTarget = d3.select(document.body)
  .append('div')
  .attr('id', "drop-target")
  .text('Drag and drop file here');

// When a file is dropped, extract its keywords. Only the first dropped file
// is processed; any additional files in the same drop are ignored.
dragTarget.on('drop', () => {
  console.log('File(s) dropped');
  dragTarget.classed('dragging', false);

  // Prevent default behavior (Prevent file from being opened)
  d3.event.preventDefault();

  const droppedFiles = d3.event.dataTransfer.files;
  if (droppedFiles.length === 0) {
    return;
  }

  // loadDataFromFile is async: report failures instead of leaving the
  // promise floating, which would surface as an unhandled rejection.
  loadDataFromFile(droppedFiles[0]).catch((err) => {
    console.error('Failed to extract keywords from dropped file', err);
  });
});

// Cancelling dragover is what marks the element as a valid drop target.
dragTarget.on('dragover', () => {
  d3.event.preventDefault();
});

// Toggle the highlight while a drag is over the drop zone.
dragTarget.on('dragenter', () => {
  dragTarget.classed('dragging', true);
});

dragTarget.on('dragleave', () => {
  dragTarget.classed('dragging', false);
});

file data loading

/**
 * Reads a dropped file, extracts its text, runs keyword extraction on it and
 * appends the result (file name heading + pretty-printed JSON) to the drop zone.
 *
 * The extractor is chosen from the file name's extension: `.doc`/`.docx` go to
 * readTextFromDoc, `.pdf` goes to readTextFromPdf (both case-insensitive).
 *
 * @param {File} file - The file to process; only `name` is inspected here.
 * @returns {Promise<void>} Resolves once the keywords have been rendered.
 * @throws {Error} If the file extension is not one of the supported types.
 */
async function loadDataFromFile(file) {
  // Decide on the extractor before reading, so unsupported files fail fast
  // with a clear message instead of a TypeError on `data.text` later on.
  let extractText;
  if (/\.docx?$/i.test(file.name)) {
    extractText = readTextFromDoc;
  } else if (/\.pdf$/i.test(file.name)) {
    extractText = readTextFromPdf;
  } else {
    throw new Error(`Unsupported file type: ${file.name}`);
  }

  const content = await readFile(file);
  const data = await extractText(content);

  // Show the heading and an empty result box first, then fill in the
  // keywords once extraction has finished.
  const target = dragTarget.append('div');
  target.append('h2').text(file.name);
  const output = target.append('pre');
  const extracted = await RETEXT.getKeywords(data.text);
  output.text(JSON.stringify(extracted, null, 2));
}

/**
 * Reads a file into memory as an ArrayBuffer using FileReader.
 *
 * @param {Blob} file - The file (or blob) to read.
 * @returns {Promise<ArrayBuffer>} Resolves with the file's content; rejects
 *   with the reader's error if the read fails.
 */
function readFile(file) {
  return new Promise((resolve, reject) => {
    const fileReader = new FileReader();
    fileReader.onload = () => {
      resolve(fileReader.result);
    };
    // Without this handler a failed read would leave the promise pending forever.
    fileReader.onerror = () => {
      reject(fileReader.error);
    };
    fileReader.readAsArrayBuffer(file);
  });
}

pdf

/**
 * Extracts the text and basic metadata of a PDF using pdf.js.
 *
 * @param {ArrayBuffer} content - Raw bytes of the PDF file.
 * @returns {Promise<{meta: {Title: *, Subject: *, Author: *}, text: string}>}
 *   `meta` holds the Title, Subject and Author entries of the document info;
 *   `text` is the text of all pages, with text items separated by a newline
 *   and pages separated by a blank line.
 */
async function readTextFromPdf(content) {
  // getDocument returns a loading task; its `promise` property is the
  // documented way to wait for the document. Fall back to awaiting the
  // task itself for builds where it is directly thenable.
  const loadingTask = pdfjsLib.getDocument(new Uint8Array(content));
  const pdf = await (loadingTask.promise || loadingTask);

  const meta = await pdf.getMetadata();
  const { Title, Subject, Author } = meta.info;

  const pageTexts = [];

  // Page numbers are 1-based. Pages are read one after another so the
  // resulting text keeps document order.
  for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
    const page = await pdf.getPage(pageNumber);
    const textContent = await page.getTextContent({
      normalizeWhitespace: true
    });
    pageTexts.push(textContent.items.map((item) => item.str).join('\n'));
  }

  return {
    meta: {
      Title,
      Subject,
      Author
    },
    text: pageTexts.join('\n\n')
  };
}

word

/**
 * Extracts the raw text of a Word document using mammoth.
 *
 * @param {ArrayBuffer} content - Raw bytes of the document.
 * @returns {Promise<{text: string}>} The `value` reported by mammoth, as `text`.
 */
async function readTextFromDoc(content) {
  const { value } = await mammoth.extractRawText({ arrayBuffer: content });
  return { text: value };
}

keywords extraction based on #2036

var retext = require('retext')
var keywords = require('retext-keywords')
var toString = require('nlcst-to-string')

/**
 * Extracts keywords and key phrases from a text using retext-keywords.
 *
 * @param {string} text - The text to analyse.
 * @param {number} [maximum=10] - Passed to retext-keywords as its `maximum` option.
 * @returns {Promise<{keywords: string[], keyphrases: string[]}>} The string
 *   form of the first match of every keyword and key phrase found. Rejects if
 *   processing fails or the result cannot be read.
 */
module.exports.getKeywords = function(text, maximum = 10) {
  // The executor is deliberately synchronous: a synchronous throw from the
  // pipeline setup then rejects this promise instead of being lost.
  return new Promise((resolve, reject) => {
    retext()
      .use(keywords, {
        maximum
      })
      .process(text, (err, file) => {
        // Throwing inside this callback would not settle the promise, so
        // every failure is routed through reject explicitly.
        if (err) {
          reject(err);
          return;
        }

        try {
          const foundKeywords = file.data.keywords.map(
            (keyword) => toString(keyword.matches[0].node)
          );

          // Wrap toString so map's extra (index, array) arguments are not
          // forwarded to it.
          const foundKeyphrases = file.data.keyphrases.map(
            (phrase) => phrase.matches[0].nodes.map((node) => toString(node)).join('')
          );

          resolve({
            keywords: foundKeywords,
            keyphrases: foundKeyphrases
          });
        } catch (readError) {
          reject(readError);
        }
      });
  });
};

pdf loading also shows metadata extraction