musedata / musedata-website

Website frontend for musedata
https://www.musedata.org
0 stars 0 forks source link

Add system to download the Google spreadsheet index of Musedata PDFs #4

Closed craigsapp closed 4 years ago

craigsapp commented 4 years ago

In June, Google spreadsheets became difficult to download directly to a web browser due to CORS restrictions.

A solution is to use Google Apps Scripts to create a URL that is not limited by CORS. To do this for a Google spreadsheet, got to Tools → Script editor...

Screen Shot 2020-09-17 at 6 20 46 PM

After creating and saving the following script, you have to "publish" the script: Go to the menu item Publish → Deploy as web app...:

Screen Shot 2020-09-17 at 6 22 20 PM

Then a window appears:

Screen Shot 2020-09-17 at 6 24 19 PM

Every time you update the script, you must create a new "Project version". Also allow for "Anyone, even anonymous" to access the script. Then press Update, and a new window appears:

Screen Shot 2020-09-17 at 6 26 12 PM

Make note of the URL (which will remain the same between different versions) and then click on OK.

The URL to the data is:

https://script.google.com/macros/s/AKfycbyIBWnniSSUHEDr10yRdMVx72sQDXXntyQ6bd0az2JpumBulp4/exec

This links to a JSON version of the spreadsheet, with each entry in an array being a data line in the spreadsheet table, such as the first two lines generate these two entries in the JSON data:

    {
        "LEVEL_1": "Corelli",
        "LEVEL_2": "Sonate da chiesa, op. 1",
        "LEVEL_3": "Sonata da chiesa in F major, op. 1, no. 1",
        "SHORTCUT": "corelli-op1-no01",
        "DATA_TYPE": "Page files",
        "DATA_URL": "https://raw.githubusercontent.com/musedata/corelli/master/op1/pages-score/corelli-op1n01.iff",
        "URL": "http://corelli.ccarh.org",
        "EDITOR": "Frances Bennion"
    },
    {
        "LEVEL_1": "Corelli",
        "LEVEL_2": "Sonate da chiesa, op. 1",
        "LEVEL_3": "Sonata da chiesa in E minor, op. 1, no. 2",
        "SHORTCUT": "corelli-op1-no02",
        "DATA_TYPE": "Page files",
        "DATA_URL": "https://raw.githubusercontent.com/musedata/corelli/master/op1/pages-score/corelli-op1n02.iff",
        "URL": "http://corelli.ccarh.org",
        "EDITOR": "Frances Bennion"
    },

There are other formats available for different environments and applications, which are accessed by giving an additional parameter at the end of the URL (&format=xxx):

Raw TSV version:
https://script.google.com/macros/s/AKfycbyIBWnniSSUHEDr10yRdMVx72sQDXXntyQ6bd0az2JpumBulp4/exec?format=raw

Cleaned TSV version (no explanatory text at start of file):

https://script.google.com/macros/s/AKfycbyIBWnniSSUHEDr10yRdMVx72sQDXXntyQ6bd0az2JpumBulp4/exec?format=tsv

ATON version: https://script.google.com/macros/s/AKfycbyIBWnniSSUHEDr10yRdMVx72sQDXXntyQ6bd0az2JpumBulp4/exec?format=aton This is a lower-complexity syntax that can be converted into JSON.

Humdrum version: https://script.google.com/macros/s/AKfycbyIBWnniSSUHEDr10yRdMVx72sQDXXntyQ6bd0az2JpumBulp4/exec?format=humdrum This is a spreadsheet like format that can be manipulated with tools from the Humdrum Toolkit.

Using this system avoids dependence on server-side infrastructure, and allows non-technical persons to control the website index directly from the spreadsheet.

craigsapp commented 4 years ago

Implemented and used with commit https://github.com/musedata/musedata.github.io/commit/54fe47a1b9d7be89efbd7447a9dfc4a54f873262

Here is the Google Apps Script used to extract data from the spreadsheet:

//
// CGI parameters that the script understands:
//
// format :: output format for the spreadsheet.  Values can be:
//      json :: output as JSON data (the default format).
//      raw  :: plain text TSV output of the header
//      tsv  :: similar to raw, but remove any text above the header line.
//      aton :: output spreadsheet in the ATON format (https://aton.sapp.org)
//      humdrum :: output spreadsheet in the Humdrum format.
// heading :: Text of an expected heading value for data columns.  This is used
//      to skip over extra text before the data column headings.
// label :: This is the name of the row object in ATON data output.  The default is "ENTRY".
//

//////////////////////////////
//
// doGet -- this is the function that returns a GET request for the web app URL.
//

function doGet(event) {
  // The targetHeaderField is a column heading that the spreadsheet table starts
  // at.  This allows for explanatory text above the table to be ignored.  The
  // heading text is a sample heading that is on the header line.  Any field
  // can be used as long as it is not on a line by itself or in a cell by itself
  // in the free-form text before the heading line.
  let targetHeaderField = "LEVEL_1";
  if (event && event.paramameter && event.parameter.heading) {
    targetHeaderField = event.parameter.heading;
  }
  try {
    let sid = "1U4Z-NeU2FGk_qNq6BwXeSVN-p8AnZsWbRHGo5fk3f3A";
    let gid = "0";
    let url = "https://docs.google.com/spreadsheets/d/"+sid+"/export?gid="+gid+"&format=tsv";
    const value = UrlFetchApp.fetch(url);
    let raw = value.getContentText();
    let format = "json";
    if (event && event.parameter && event.parameter.format) {
      let f = event.parameter.format;
      if (f.match(/aton/i)) {
        format = "aton";
      } else if (f.match(/tsv/i)) {
        format = "tsv";
      } else if (f.match(/hum/i)) {
        format = "humdrum";
      } else if (f.match(/raw/i)) {
        format = "raw";
      }
    }
    if (format === "raw") {
      return ContentService.createTextOutput(raw);
    } else if (format === "tsv") {
      return ContentService.createTextOutput(extractTsv(raw, targetHeaderField));
    } else if (format === "humdrum") {
      return ContentService.createTextOutput(tsvToHumdrum(raw, targetHeaderField));
    }
    let obj = spreadsheetToObject(raw, -1, "LEVEL_1", true);
    if (format === "aton") {
      var label = "ENTRY";
      if (event && event.parameter && event.parameter.label) {
        label = event.parameter.label;
      }
      var aton = spreadsheetArrayToAton(obj, label);
      return ContentService.createTextOutput(aton);
    } else {
       let json = JSON.stringify(obj, null, "\t");
       return ContentService.createTextOutput(json)
              .setMimeType(ContentService.MimeType.JSON);
    }  
  } catch (err) {
    let output = JSON.stringify({
      message: "Failed to extract speadsheet data: " + err.message, 
      stack: err.stack, 
      error: err}, null, "\t");
    return ContentService.createTextOutput(output);
  }
}

//////////////////////////////
//
// getHeaderIndex -- Given a target column header text, return the first
//     line that has that text in a TSV line.
//

function getHeaderIndex(lines, targetField) {
  for (let i=0; i<lines.length; i++) {
     let data = lines[i].split(/\t/);
     for (let j=0; j<data.length; j++) {
       let value = data[j].trim();
       if (value === targetField) {
           return i;
       }
     }
  }
  return 0;
}

//////////////////////////////
//
// extractTsv -- return the raw TSV data, but remove any header material
//

function extractTsv(text, headerField) {
  let lines = text.match(/[^\r\n]+/g);
  let headerIndex = getHeaderIndex(lines, headerField);
  let output = "";
  for (let i=headerIndex; i<lines.length; i++) {
    output += lines[i] + "\n";
  }
  return output;
}

//////////////////////////////
//
// spreadsheetToObject -- Convert TSV form of spreadsheet into Javascript object.
//

function spreadsheetToObject(text, headerIndex, headerField, suppressEmpty) {
  if (!text) {
    return [];
  }
  if (!headerIndex) {
     headerIndex = 0;
  } 
  suppressEmpty = !!suppressEmpty;

  let lines = text.match(/[^\r\n]+/g);
  if (headerField) {
      headerIndex = getHeaderIndex(lines, headerField);
  }
  let mapping = {};
  let data = lines[headerIndex].split(/\t/);
  for (let i=0; i<data.length; i++) {
       if (data[i].match(/^\s*$/)) {
           continue;
       }
      mapping[i] = data[i].trim();
  }
  var output = [];
  for (let i=headerIndex+1; i<lines.length; i++) {
     if (lines[i].match(/^\s*$/)) {
        continue;
     }    
      data = lines[i].split(/\t/);
      var item = {};
      for (let j=0; j<data.length; j++) {
        if (suppressEmpty && data[j].match(/^\s*$/)) {
          continue;
        }
        if (!mapping[j] || mapping[j].match(/^\s*$/)) {
          continue;
        }
         item[mapping[j]] = data[j].trim();
      }
      output.push(item);
  }
  return output;
}

///////////////////////////////
//
// spreadsheetArrayToAton -- convert a spreadsheet in JSON format to ATON format.
//     Input has to be an array.
//

function spreadsheetArrayToAton(obj, label) {
  if (!label) {
    label = "ENTRY";
  }
  let output = "";
  for (let i=0; i<obj.length; i++) {
    output += "@@BEGIN: " + label + "\n";
    for (property in obj[i]) {
      output += "@" + property + ": " + obj[i][property] + "\n";
    }
    output += "@@END: " + label + "\n\n";
  }
  return output;
}

//////////////////////////////
//
// getHeadings -- return an array of the text on the heading row.
//     It is assumed that there are no empty columns.  Spaces are not
//     allowed in the headings and most non-alphanumeric characters.
//

function getHeadings(text) {
  let fields = text.split(/\t+/);
  let output = [];
  for (let i=0; i<fields.length; i++) {
    let value = fields[i].trim().replace(/\s+/g, "_").replace(/[^A-Za-z0-9_#-]/g, "");
    output.push(value);
  }
  return output;
}

//////////////////////////////
//
// tsvToHumdrum -- convert a spreadsheet into the humdrum format, with optional
//    text on lines before the table.
//

function tsvToHumdrum(text, headerField) {
  if (!text) {
    return "";
  }
  let lines = text.match(/[^\r\n]+/g);
  let headerIndex = getHeaderIndex(lines, headerField);
  let output = "";
  for (let i=0; i<headerIndex; i++) {
    if (lines[i].match(/^!!/)) {
      output += lines[i].replace(/\s+$/, "") + "\n";
    } else {
      output += "!! " + lines[i].replace(/\s+$/, "") + "\n";
    }
  }
  let headings = getHeadings(lines[headerIndex]);

  // print exclusive interpretation line:
  for (let i=0; i<headings.length; i++) {
    if (headings[i].match(/^\*\*/)) {
      output += headings[i];
    } else if (headings[i].match(/^\*/)) {
      output += "*" + headings[i];
    } else {
      output += "**" + headings[i];
    }
    if (i < headings.length - 1) {
      output += "\t";
    }
  }
  output += "\n";

  // Print individual cells of table as Humdrum tokens
  // (not checking for leading ! or *).  It is assumed that
  // the row fields are either constant or that they are 
  // controlled with spine manipulators embedded in the data.
  for (let i=headerIndex + 1; i<lines.length; i++) {
    if (lines[i].match(/^\s*$/)) {
      continue;
    }
    let fields = lines[i].split(/\t/);
    for (let j=0; j<fields.length; j++) {
      if (j >= fields.length) {
        output += ".";
      }
      if (fields[j].match(/^\s*$/)) {
        output += ".";
      } else {
        output += fields[j];
      }
      if (j < fields.length - 1) {
        output += "\t";
      }
    }
    output += "\n";
  }

  // print terminator line:
  for (let i=0; i<headings.length; i++) {
    output += "*-";
    if (i < headings.length - 1) {
      output += "\t";
    }
  }
  output += "\n";

  return output;
}

The script extracts the text of the URL of the spreadsheet rather than access the spreadsheet app directly, which would also be possible.