Hypertopic / Porphyry

Corpus analyses confrontation
https://hypertopic.org/porphyry
GNU Affero General Public License v3.0
21 stars 165 forks source link

Another index structure could be used for co-occurrences computation #213

Open benel opened 4 years ago

benel commented 4 years ago

An unpublished paper by Frédéric Merle, Aurélien Bénel and Yann Barloy (written in 2013-2014) compared the efficiency of 3 (or even 4) index structures usable to speed up the multidimensional browsing algorithm. One of them appeared to be far more efficient than those that were tested in earlier and current versions of Porphyry.

Please note, however, that, as with any index, building and updating it takes time.

benel commented 4 years ago

Co-occurrences computation appears to be negligible in comparison with data downloading:

Time Task
0,1 s – 0,2 s getView(user)
1,3 s – 2,0 s getView(corpora, viewpoints)
1,3 s – 2,0 s getView(corpora)
0,4 s – 0,6 s getView(viewpoints)
0,001 s restructuring viewpoints
0,002 s restructuring items
0,019 s co-occurrences

Tested on the stained-glasses portfolio (1771 items, 608 topics). @garnier5

benel commented 4 years ago

Pour info, voici le code (indépendant de Porphyry mais qui s'en inspire grandement) que j'avais utilisé pour faire mes mesures :

// Client library used to query the Hypertopic services listed below.
const Hypertopic = require('hypertopic');
// User whose view is requested first (`/user/${USER}`).
const USER = 'vitraux';
// Service URLs handed to the `Hypertopic` constructor.
const SERVICES = [
  'http://argos2.hypertopic.org',
  'http://steatite.hypertopic.org'
];

// Timestamp (milliseconds since the epoch) of the previous checkpoint;
// initialised when the script starts.
let start = Date.now();

/**
 * Prints the number of milliseconds elapsed since the previous checkpoint,
 * followed by `x`, then makes the current instant the new checkpoint.
 *
 * @param {*} x - Value logged next to the elapsed time.
 */
let logWithTime = (x) => {
  const now = Date.now();
  const elapsed = now - start;
  console.log(elapsed, x);
  start = now;
};

// Shared state filled in by the successive steps of the pipeline below.
// `viewpoints` is also read by the `_getTopic` helper.
let user = {};
const viewpoints = [];
const items = [];

// Benchmark pipeline: each step ends with `logWithTime`, so every logged
// duration covers exactly the work done since the previous log.
const hypertopic = new Hypertopic(SERVICES);
hypertopic.getView(`/user/${USER}`)
  .then((x) => {
    // Step 1: the user's view lists the viewpoints and corpora to fetch next.
    user = x[USER];
    logWithTime(user);
    return user.viewpoint.map(y => `/viewpoint/${y.id}`)
      .concat(user.corpus.map(y => `/corpus/${y.id}`));
  })
  // Step 2: download all corpora and viewpoints. Called through `hypertopic`
  // so that the method keeps its receiver.
  .then((paths) => hypertopic.getView(paths))
  .then((x) => {
    logWithTime('GOT corpora and viewpoints');
    return x;
  })
  .then((data) => {
    // Step 3: restructure viewpoints (each one is tagged with its own id).
    for (const v of user.viewpoint) {
      const viewpoint = data[v.id];
      viewpoint.id = v.id;
      viewpoints.push(viewpoint);
    }
    logWithTime({viewpoints: viewpoints.length});
    return data;
  })
  .then((data) => {
    // Step 4: restructure items. Keys 'id', 'name' and 'user' of a corpus
    // object are skipped, as are entries without a non-empty `name`.
    for (const corpus of user.corpus) {
      for (const itemId in data[corpus.id]) {
        if (['id','name','user'].includes(itemId)) continue;
        const item = data[corpus.id][itemId];
        if (item.name && item.name.length) {
          item.id = itemId;
          item.corpus = corpus.id;
          items.push(item);
        }
      }
    }
    logWithTime({items: items.length});
    return data;
  })
  .then(() => {
    // Step 5: co-occurrences. Every item is indexed under each topic of
    // each of its topic paths.
    const selectedItems = items; //worst case
    const topicsItems = new Map();
    for (const e of selectedItems) {
      for (const t of _getRecursiveItemTopics(e)) {
        push(topicsItems, t, e.id);
      }
    }
    logWithTime({topics: topicsItems.size});
  })
  .catch((error) => {
    // Report any failure of the steps above instead of leaving the
    // rejection unhandled, and signal it through the exit code.
    console.error(error);
    process.exitCode = 1;
  });

  /**
   * Looks a topic up by id in the loaded `viewpoints`.
   *
   * @param {string} id - Key searched in each viewpoint object.
   * @returns {*} The value stored under `id` in the first viewpoint that has
   *   a truthy entry for it, or `null` when no viewpoint does.
   */
  function _getTopic(id) {
    const owner = viewpoints.find((viewpoint) => viewpoint[id]);
    return owner ? owner[id] : null;
  }

  /**
   * Records `itemId` in the set stored under `topicId`, creating that set
   * on first use.
   *
   * @param {Map} map - Index from topic id to a set of item ids; mutated in place.
   * @param {*} topicId - Key of the entry to update.
   * @param {*} itemId - Value added to the entry's set.
   */
  function push(map, topicId, itemId) {
    const existing = map.get(topicId);
    if (!existing) {
      map.set(topicId, new Set([itemId]));
      return;
    }
    map.set(topicId, existing.add(itemId));
  }

  /**
   * Builds the path leading to a topic by following the first `broader`
   * entry of each topic, as resolved by `_getTopic`.
   *
   * A topic that is unknown, has no `broader` attribute, or has an empty
   * `broader` list is treated as a root.
   *
   * @param {string} topicId - Id of the topic the path ends with.
   * @returns {string[]} Topic ids from the outermost ancestor to `topicId`.
   */
  function _getTopicPath(topicId) {
    const topic = _getTopic(topicId);
    // An empty `broader` array is truthy, so its length is checked before
    // its first element is read.
    const parents = (topic && topic.broader) || [];
    const path = parents.length ? _getTopicPath(parents[0].id) : [];
    path.push(topicId);
    return path;
  }

  /**
   * Computes one topic path per topic attached to an item.
   *
   * @param {Object} item - Item whose optional `topic` array holds objects
   *   with an `id`.
   * @returns {Array} The result of `_getTopicPath` for each topic id, in the
   *   same order; empty when the item has no `topic`.
   */
  function _getItemTopicsPaths(item) {
    const topics = item.topic || [];
    const toPath = ({id}) => _getTopicPath(id);
    return topics.map(toPath);
  }

  /**
   * Flattens the topic paths of an item into a single list.
   *
   * @param {Object} item - Item passed on to `_getItemTopicsPaths`.
   * @returns {Array} Concatenation of all the paths returned by
   *   `_getItemTopicsPaths(item)`; an id present in several paths appears
   *   several times.
   */
  function _getRecursiveItemTopics(item) {
    const paths = _getItemTopicsPaths(item);
    return [].concat(...paths);
  }
garnier5 commented 4 years ago

Tested on the stained-glasses portfolio (always the same item, 14 topics).

10 000 items

Time Task
0,05s – 0,1s getView(user)
7,0 s – 8,0 s getView(corpora, viewpoints)
0,000 s restructuring viewpoints
0,009 s restructuring items
0,090 s co-occurrences

50 000 items

Time Task
0,05s – 0,1s getView(user)
30s – 35s getView(corpora, viewpoints)
0,002 s restructuring viewpoints
0,030 s restructuring items
0,376 s co-occurrences

100 000 items

Time Task
0,1s – 0,2s getView(user)
65s – 75s getView(corpora, viewpoints)
0,007 s restructuring viewpoints
0,050 s restructuring items
0,815 s co-occurrences