TheProjecter / simal

Automatically exported from code.google.com/p/simal
0 stars 0 forks source link

BlankNodes can result in duplicate entries #107

Closed GoogleCodeExporter closed 9 years ago

GoogleCodeExporter commented 9 years ago
Although in the DOAP file the names of maintainers and developers are
listed once, in SIMAL's web page they appear more than once:

http://data.oss-watch.ac.uk:8080/?wicket:interface=:4:::: 

---------------------------------------------

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Simal Project and Community Registry Framework</title>
<link rel="stylesheet" type="text/css"
href="resources/uk.ac.osswatch.simal.wicket.UserApplication/default.css" />
</head>
<body>
<div id="header">
  <h1>Simal Project and Community Registry Framework</h1>
  <a href="./" wicket:id="homePageLink">Simal</a>
  <a
href="?wicket:bookmarkablePage=%3Auk.ac.osswatch.simal.wicket.ExhibitProjectBrow
serPage"
wicket:id="exhibitBrowserLink">Project Browser</a> 
  <a
href="?wicket:bookmarkablePage=%3Auk.ac.osswatch.simal.wicket.doap.DoapFormPage"
wicket:id="addDOAPLink">DOAP Form</a> 

</div>
<div id="content"><wicket:child><wicket:extend>
<div class="content">
  <div class="widget">
    <div class="content">
      <h2 wicket:id="projectName">3D Visualisation Network in the Arts
(3DVisA)</h2>
      <p wicket:id="shortDesc">3DVisA promotes, enhances and extends the
knowledge, understanding and opportunities enabled by digital 3D
visualisation to research and pedagogy in the Arts and Humanities.</p>
      <div wicket:id="homepages">
        <a href="http://www.viznet.ac.uk/3dvisa" wicket:id="homepage"><span
wicket:id="label">http://www.viznet.ac.uk/3dvisa</span></a>

      </div>
    </div>
  </div>

  <div class="widget">
    <h2>Details</h2>
    <div class="content">
      <p wicket:id="description">The aim of 3DVisA is to enhance and extend
3D visualisation-related knowledge, understanding and opportunities in Arts
and Humanities domains, where an increasing number of researchers,
educators, and learners are creating and/or using 3D visualisation
technologies.</p>
      <p wicket:id="releases"><wicket:panel>

</wicket:panel></p>
    </div>
  </div>

  <div class="widget">
    <h2>Community Tools</h2>
    <div class="content">

      <h3>Mailing Lists</h3>
      <ul wicket:id="mailingLists">

        <li><a href="http://3dvisa.cch.kcl.ac.uk/JISCMail.html"
wicket:id="mailingList"><span
wicket:id="label">http://3dvisa.cch.kcl.ac.uk/JISCMail.html</span></a></li>
      </ul>

      <div wicket:id="wikis">
        <a href="https://wiki.viznet.ac.uk/bin/view/"
wicket:id="wiki"><span wicket:id="label">Wiki</span></a>
      </div>

      <h2>Source repositories</h2>
      <div wicket:id="sourceRepositories"><wicket:panel>

</wicket:panel></div>

    </div>
  </div>

  <div class="widget">
    <h2>Facets</h2>
    <div class="content">
      <h2>Categories</h2>
      <div wicket:id="categories">

        <ul>
          <li><a
href="http://www.jisc.ac.uk/whatwedo/themes/eresearch.aspx"
wicket:id="category"><span
wicket:id="label">http://www.jisc.ac.uk/whatwedo/themes/eresearch.aspx</span></a
></li>
        </ul>
      </div>

      <h2>Operating System(s)</h2>

      <h2>Programming Language(s)</h2>

    </div>

  </div>

  <div class="widget">
    <h2>Contributors</h2>
    <div class="content">

      <h3>Maintainer(s)</h3>
      <div wicket:id="maintainers">
        <div wicket:id="maintainer"><wicket:panel>
  <div class="content">

    <p wicket:id="personName" class="title">Hugh Denard</p>
    <p wicket:id="homepages"></p>
  </div>
</wicket:panel></div>
      </div><div wicket:id="maintainers">
        <div wicket:id="maintainer"><wicket:panel>
  <div class="content">
    <p wicket:id="personName" class="title">Hugh Denard</p>

    <p wicket:id="homepages"></p>
  </div>
</wicket:panel></div>
      </div><div wicket:id="maintainers">
        <div wicket:id="maintainer"><wicket:panel>
  <div class="content">
    <p wicket:id="personName" class="title">Hugh Denard</p>
    <p wicket:id="homepages">http://john.doe.org</p>

  </div>
</wicket:panel></div>
      </div>

      <h3>Developer(s)</h3>
      <div wicket:id="developers">
        <div wicket:id="developer"><wicket:panel>
  <div class="content">
    <p wicket:id="personName" class="title">Julie Tolmie</p>

    <p wicket:id="homepages"></p>
  </div>
</wicket:panel></div>
      </div><div wicket:id="developers">
        <div wicket:id="developer"><wicket:panel>
  <div class="content">
    <p wicket:id="personName" class="title">Anna Bentkowska-Kafel</p>
    <p wicket:id="homepages"></p>
  </div>

</wicket:panel></div>
      </div><div wicket:id="developers">
        <div wicket:id="developer"><wicket:panel>
  <div class="content">
    <p wicket:id="personName" class="title">Richard Beacham</p>
    <p wicket:id="homepages">http://john.doe.org</p>
  </div>
</wicket:panel></div>
      </div><div wicket:id="developers">

        <div wicket:id="developer"><wicket:panel>
  <div class="content">
    <p wicket:id="personName" class="title">Anna Bentkowska-Kafel</p>
    <p wicket:id="homepages"></p>
  </div>
</wicket:panel></div>
      </div><div wicket:id="developers">
        <div wicket:id="developer"><wicket:panel>
  <div class="content">

    <p wicket:id="personName" class="title">Richard Beacham</p>
    <p wicket:id="homepages"></p>
  </div>
</wicket:panel></div>
      </div><div wicket:id="developers">
        <div wicket:id="developer"><wicket:panel>
  <div class="content">
    <p wicket:id="personName" class="title">Julie Tolmie</p>

    <p wicket:id="homepages">http://john.doe.org</p>
  </div>
</wicket:panel></div>
      </div><div wicket:id="developers">
        <div wicket:id="developer"><wicket:panel>
  <div class="content">
    <p wicket:id="personName" class="title">Julie Tolmie</p>
    <p wicket:id="homepages"></p>

  </div>
</wicket:panel></div>
      </div><div wicket:id="developers">
        <div wicket:id="developer"><wicket:panel>
  <div class="content">
    <p wicket:id="personName" class="title">Richard Beacham</p>
    <p wicket:id="homepages"></p>
  </div>
</wicket:panel></div>

      </div><div wicket:id="developers">
        <div wicket:id="developer"><wicket:panel>
  <div class="content">
    <p wicket:id="personName" class="title">Anna Bentkowska-Kafel</p>
    <p wicket:id="homepages">http://john.doe.org</p>
  </div>
</wicket:panel></div>
      </div>

      <h3>Tester(s)</h3>

      <h3>Helper(s)</h3>

      <h3>Documenter(s)</h3>

      <h3>Translator(s)</h3>

    </div>
  </div>

  <div class="widget">

    <h2>Downloads</h2>
    <div class="content">

    </div>
  </div>

  <p class="created">Created:  <span wicket:id="created">2008-3-26</span></p>
</div>
</wicket:extend></wicket:child></div>
<div id="footer"><span wicket:id="footer">This is in the footer</span></div>

</body>
</html>

Original issue reported on code.google.com by gabh...@gmail.com on 9 Apr 2008 at 5:46

GoogleCodeExporter commented 9 years ago
This doesn't occur with any of the test files.

Original comment by rgardler...@gmail.com on 9 Apr 2008 at 8:48

GoogleCodeExporter commented 9 years ago

This happened again when I uploaded a doap file, then modified it 

from 

<name>The digitisation of 100 key journals, 205 monographs and 2,500 manuscript 
pages
from core Irish Studies collections</name> 

to 

<name>A digital library of core e-resources on Ireland</name> 

then re-uploaded the modified version.

http://data.oss-watch.ac.uk:8080/?wicket:interface=:20::::

The modified doap file:

--------------------------
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Copyright 2007 University of Oxford

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    xmlns="http://usefulinc.com/ns/doap#"
    xmlns:foaf="http://xmlns.com/foaf/0.1/"
    xmlns:labs="http://labs.apache.org/doap-ext/1.0#"
    xmlns:dc="http://purl.org/dc/elements/1.1/"
    xmlns:projects="http://projects.apache.org/ns/asfext#">

    <Project rdf:about="http://www.qub.ac.uk/cdda-old/dlcmi/">

        <name>A digital library of core e-resources on Ireland</name>

        <shortdesc xml:lang="en">The digitisation of key documents from core Irish
Studies collections.</shortdesc>

        <description xml:lang="en">The digitisation of 100 key journals, 205
monographs and 2,500 manuscript pages from core Irish Studies collections makes 
this
comprehensive, multi-disciplinary digital library the first point of contact for
scholars and students seeking a convenient and comprehensive one stop shop for
e-resources relating to Ireland.</description>

        <homepage
rdf:resource="http://www.jisc.ac.uk/whatwedo/programmes/programme_digitisation/i
reland.aspx"
/>

        <category
rdf:resource="http://www.jisc.ac.uk/whatwedo/programmes/programme_digitisation.a
spx" />

        <created>2008-3-26</created>

        <!--  <wiki
rdf:resource="http://www.pligg.com/wiki/index.php?title=Main_Page"/> -->

        <!--  <screenshots
rdf:resource="http://usefulinc.com/software/gnome-bluetooth/#shots" /> -->

        <mailing-list
rdf:resource="http://www.jiscmail.ac.uk/archives/irish-studies.html" />

        <!-- <bug-database rdf:resource="http://bugzilla.gnome.org/" />  -->

        <!-- <download-page rdf:resource="http://www.redland.opensource.ac.uk/dist/"
/> -->

        <!-- <download-mirror rdf:resource="http://sourceforge.net/projects/librdf/"
/> -->

        <!-- <license rdf:resource="http://usefulinc.com/doap/licenses/GPL" /> -->

        <!-- <programming-language xml:lang="en">PHP</programming-language> -->

            <maintainer>
            <foaf:Person>
            <foaf:title>Dr.</foaf:title>
            <foaf:name>Paul Ell</foaf:name>
            <foaf:mbox>paul.ell@qub.ac.uk</foaf:mbox>
            <foaf:homepage
rdf:resource="http://www.qub.ac.uk/schools/gap/Staff/AcademicStaff/DrPaulEll/" 
/>
            </foaf:Person>
            </maintainer>

        <maintainer>
            <foaf:Person>
            <foaf:title></foaf:title>
            <foaf:name>Deirdre Wildy</foaf:name>
            <foaf:mbox>D.Wildy@qub.ac.uk</foaf:mbox>
            <foaf:homepage
rdf:resource="http://www.qub.ac.uk/directorates/InformationServices/TheLibrary/S
tafftoContact/SubjectLibrarians/"
/>
            </foaf:Person>
            </maintainer>

            <developer>
            <foaf:Person>
            <foaf:title></foaf:title>
            <foaf:name>Elaine Yeates</foaf:name>
            <foaf:mbox>E.Yeates@qub.ac.uk</foaf:mbox>
            <foaf:homepage
rdf:resource="http://www.qub.ac.uk/schools/gap/Staff/AdministrativeStaff/ElaineY
eates/"
/>
            </foaf:Person>
            </developer>

<!-- 
            <developer>
            <foaf:Person>
            <foaf:title>Dr.</foaf:title>
            <foaf:name>FIXME: John Doe</foaf:name>
            <foaf:mbox>FIXME: email@foo.org</foaf:mbox>
            <foaf:homepage rdf:resource="http://john.doe.org" />
            </foaf:Person>
            </developer>
        -->

        <!-- 
            <release>
            <Version>
            <name>Pligg beta</name>
            <created>2007-04-29</created>
            <revision>9.5</revision>
            </Version>
            </release>
        -->

        <!-- 
            <repository>
            <SVNRepository>
            <location rdf:resource="http://svn.usefulinc.com/svn/repos/trunk/doap/" />
            <browse
rdf:resource="http://svn.usefulinc.com/cgi-bin/viewcvs.cgi/trunk/doap/" />
            </SVNRepository>
            </repository>

            <repository>
            <CVSRepository>
            <anon-root>
            pserver:anonymous@newscloud.cvs.sourceforge.net/cvsroot
            </anon-root>
            <module>newscloud</module>
            <browse
            rdf:resource="http://newscloud.cvs.sourceforge.net/newscloud" />
            </CVSRepository>
            </repository>
        -->

    </Project>
</rdf:RDF>

Original comment by gabh...@gmail.com on 11 Apr 2008 at 1:51

GoogleCodeExporter commented 9 years ago
That gives me a good idea of what the problem is.

Since the foaf:Person entries do not have an identifier Simal is creating one
automatically. However, there is no attempt to ensure that the person does not
already exist. Hence a duplicate is being entered.

This is non-trivial to fix. However, each person should have a unique
identifier.

Whilst waiting for a fix in code I suggest manually adding an identifier to each
foaf:Person using an rdf:about attribute as follows:

<foaf:Person 
rdf:about="http://simal.oss-watch.ac.uk/defaultPersonNS#ElaineYeates">
            <foaf:title></foaf:title>
            <foaf:name>Elaine Yeates</foaf:name>
            <foaf:mbox>E.Yeates@qub.ac.uk</foaf:mbox>
            <foaf:homepage
rdf:resource="http://www.qub.ac.uk/schools/gap/Staff/AdministrativeStaff/ElaineY
eates/"
/>
            </foaf:Person>
            </developer>

At some point in the future we'll add duplicate detection and resolution on 
import.

Original comment by rgardler...@gmail.com on 11 Apr 2008 at 2:11

GoogleCodeExporter commented 9 years ago
Indeed the problem is as described above, but it is not limited to person
entities.

It's showing its head all over.

I've put a workaround in place for the person issue. Basically we detect 
duplicate
people using the mbox_sha1sum or the rdfs:seeAlso element. The older of the two 
in a
duplication is deleted. What should really happen is we merge the two. But this 
will
do for now.

Original comment by rgardler...@gmail.com on 12 Apr 2008 at 12:38

GoogleCodeExporter commented 9 years ago
This issue requires a rewrite of the AnnotatingRDFXMLHandler.

We need to read the document into memory, keeping a map of the entities present.

We then need to work through each of those entities to ensure that they conform
to
the Simal rules (i.e. they have unique IDs, there are no blank nodes, there
are no
duplicates).

Original comment by rgardler...@gmail.com on 12 Apr 2008 at 10:32

GoogleCodeExporter commented 9 years ago
The Elmo Smusher may help (or at least provide inspiration), see 
http://www.openrdf.org/doc/elmo/1.0-rc1/user-guide.html#AEN622

Original comment by rgardler...@gmail.com on 12 Apr 2008 at 1:29

GoogleCodeExporter commented 9 years ago
This issue occurred again this afternoon when I altered the DOAP file according 
to
the new template and re-uploaded it: 

http://data.oss-watch.ac.uk:8080/?wicket:interface=:2::::

As a result three layers of duplicate maintainers and developers appeared 
instead of
the previous two. 

Below is the submitted DOAP file:

--------------------------------------------------

<?xml version="1.0" encoding="UTF-8"?>
<!--
  Copyright 2007 University of Oxford

  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
-->
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns="http://usefulinc.com/ns/doap#" xmlns:foaf="http://xmlns.com/foaf/0.1/"
xmlns:labs="http://labs.apache.org/doap-ext/1.0#"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:projects="http://projects.apache.org/ns/asfext#">

    <Project
rdf:about="http://www.jisc.ac.uk/whatwedo/programmes/programme_eresearch/project
_3dvisa.aspx">

        <name>3D Visualisation Network in the Arts</name>

        <shortdesc xml:lang="en">Promotes, enhances and extends the knowledge,
understanding and opportunities enabled by digital 3D visualisation to research 
and
pedagogy in the Arts and Humanities.</shortdesc>

        <description xml:lang="en">The aim of 3DVisA is to enhance and extend 3D
visualisation-related knowledge, understanding and opportunities in Arts and
Humanities domains, where an increasing number of researchers, educators, and
learners are creating and/or using 3D visualisation technologies.</description>

        <homepage rdf:resource="http://www.viznet.ac.uk/3dvisa"/>
    <homepage
rdf:resource="http://www.jisc.ac.uk/whatwedo/programmes/programme_eresearch/proj
ect_3dvisa.aspx"/>

        <category rdf:resource="http://www.jisc.ac.uk/whatwedo/themes/eresearch.aspx"/>
<category
rdf:resource="http://www.jisc.ac.uk/whatwedo/programmes/programme_eresearch.aspx
"/>
<category
rdf:resource="http://www.jisc.ac.uk/aboutus/committees/sub_committees/jsr.aspx"/
>

        <created>2008-3-26</created>

         <wiki rdf:resource="https://wiki.viznet.ac.uk/bin/view/"/> 

        <!--  <screenshots
rdf:resource="http://usefulinc.com/software/gnome-bluetooth/#shots" /> -->

        <mailing-list rdf:resource="http://3dvisa.cch.kcl.ac.uk/JISCMail.html"/>  

        <!-- <bug-database rdf:resource="http://bugzilla.gnome.org/" />  -->

        <!-- <download-page rdf:resource="http://www.redland.opensource.ac.uk/dist/"
/> -->

        <!-- <download-mirror rdf:resource="http://sourceforge.net/projects/librdf/"
/> -->

        <!-- <license rdf:resource="http://usefulinc.com/doap/licenses/GPL" /> -->

        <!-- <programming-language xml:lang="en">PHP</programming-language> -->

            <maintainer>
            <foaf:Person>
            <foaf:title>Dr.</foaf:title>
            <foaf:name>Hugh Denard</foaf:name>
            <foaf:mbox rdf:resource="mailto:Hugh.Denard@kcl.ac.uk"/>
            </foaf:Person>
            </maintainer>

            <developer>
            <foaf:Person>
            <foaf:title>Prof.</foaf:title>
            <foaf:name>Richard Beacham</foaf:name>
            <foaf:mbox rdf:resource="mailto:Richard.Beacham@kcl.ac.uk"/>
            </foaf:Person>
            </developer>

            <developer>
            <foaf:Person>
            <foaf:title>Dr.</foaf:title>
            <foaf:name>Anna Bentkowska-Kafel</foaf:name>
            <foaf:mbox rdf:resource="mailto:Anna.Bentkowska@kcl.ac.uk"/>
            </foaf:Person>
            </developer>

            <developer>
            <foaf:Person>
            <foaf:title>Dr.</foaf:title>
            <foaf:name>Julie Tolmie</foaf:name>
            <foaf:mbox rdf:resource="mailto:Julie.Tolmie@kcl.ac.uk"/>
            </foaf:Person>
            </developer>

        <!-- 
            <release>
            <Version>
            <name>Pligg beta</name>
            <created>2007-04-29</created>
            <revision>9.5</revision>
            </Version>
            </release>
        -->

        <!-- 
            <repository>
            <SVNRepository>
            <location rdf:resource="http://svn.usefulinc.com/svn/repos/trunk/doap/" />
            <browse
rdf:resource="http://svn.usefulinc.com/cgi-bin/viewcvs.cgi/trunk/doap/" />
            </SVNRepository>
            </repository>

            <repository>
            <CVSRepository>
            <anon-root>
            pserver:anonymous@newscloud.cvs.sourceforge.net/cvsroot
            </anon-root>
            <module>newscloud</module>
            <browse
            rdf:resource="http://newscloud.cvs.sourceforge.net/newscloud" />
            </CVSRepository>
            </repository>
        -->

    </Project>
</rdf:RDF>

Original comment by gabh...@gmail.com on 18 Apr 2008 at 3:57

GoogleCodeExporter commented 9 years ago

Original comment by rgardler...@gmail.com on 7 May 2008 at 1:52

GoogleCodeExporter commented 9 years ago
I think this is fixed now, but we need plenty of testing.

Original comment by rgardler...@gmail.com on 19 May 2008 at 12:03

GoogleCodeExporter commented 9 years ago

Original comment by rgardler...@gmail.com on 13 Sep 2008 at 10:16