SmartDataAnalytics / SML-Bench

A Benchmark for Machine Learning from Structured Data
Apache License 2.0
21 stars 4 forks source link

Integrate RapidMiner learning system #14

Closed patrickwestphal closed 7 years ago

patrickwestphal commented 7 years ago

Make the RapidMiner tool available as learning system in SML-Bench.

patrickwestphal commented 7 years ago

RapidMiner itself does not support reading OWL files, but there is a RapidMiner extension, the LOD extension, which can handle this. With this extension, a workflow for doing rule induction based on positive and negative examples could look like this (screenshot: rapidminer_process):

Unfortunately, there seems to be no way to talk to RapidMiner Studio from an external program such as our run scripts. Thus RapidMiner Server needs to be brought into action. Using RapidMiner Server, any program/agent can talk to RapidMiner via SOAP messages. The above workflow can then be expressed in XML as follows:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition (process version 7.1.001).
  Data flow, as wired by the connect elements at the bottom:
  "SPARQL Data Importer" and "Retrieve examples2" feed "Join", followed by
  "Replace Missing Values", "Set Role" and "Rule Induction", whose model is
  delivered on the process port "result 1".
-->
<process version="7.1.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.1.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- LOD extension operator (class prefix "lod:"): runs the SPARQL query below
           over the connection named "DBpedia". Line breaks inside the query are
           written as character references because it lives in an attribute value. -->
      <operator activated="true" class="lod:SPARQL Data Importer" compatibility="1.5.012" expanded="true"
                height="68" name="SPARQL Data Importer" width="90" x="246" y="34">
        <parameter key="SPARQL connection" value="DBpedia"/>
        <parameter key="SPARQL query" value="SELECT ?s ?p ?o&#10;WHERE {&#10;  ?s a &lt;http://dbpedia.org/ontology/Person&gt; .&#10;  ?s ?p ?o&#10;}"/>
      </operator>
      <!-- Loads the repository entry data/examples2.
           NOTE(review): presumably the positive/negative examples; confirm. -->
      <operator activated="true" class="retrieve" compatibility="7.1.001" expanded="true"
                height="68" name="Retrieve examples2" width="90" x="246" y="136">
        <parameter key="repository_entry" value="data/examples2"/>
      </operator>
      <!-- Outer join of the two inputs, keyed on attribute "s" on both sides. -->
      <operator activated="true" class="join" compatibility="7.1.001" expanded="true"
                height="82" name="Join" width="90" x="380" y="85">
        <parameter key="remove_double_attributes" value="true"/>
        <parameter key="join_type" value="outer"/>
        <parameter key="use_id_attribute_as_key" value="false"/>
        <list key="key_attributes">
          <parameter key="s" value="s"/>
        </list>
        <parameter key="keep_both_join_attributes" value="false"/>
      </operator>
      <!-- Restricted to the single attribute "label" (attribute_filter_type=single).
           NOTE(review): with default=value, missing entries presumably receive the
           replenishment_value 0; confirm against the operator reference. -->
      <operator activated="true" class="replace_missing_values" compatibility="7.1.001" expanded="true"
                height="103" name="Replace Missing Values" width="90" x="514" y="85">
        <parameter key="return_preprocessing_model" value="false"/>
        <parameter key="create_view" value="true"/>
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="attributes" value="label"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="default" value="value"/>
        <list key="columns"/>
        <parameter key="replenishment_value" value="0"/>
      </operator>
      <!-- Gives the attribute named "label" the target role "label". -->
      <operator activated="true" class="set_role" compatibility="7.1.001" expanded="true"
                height="82" name="Set Role" width="90" x="648" y="85">
        <parameter key="attribute_name" value="label"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- The learner; uses the information_gain criterion. use_local_random_seed is
           false, so local_random_seed is presumably not applied. -->
      <operator activated="true" class="rule_induction" compatibility="7.1.001" expanded="true"
                height="82" name="Rule Induction" width="90" x="782" y="85">
        <parameter key="criterion" value="information_gain"/>
        <parameter key="sample_ratio" value="0.9"/>
        <parameter key="pureness" value="0.9"/>
        <parameter key="minimal_prune_benefit" value="0.25"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!-- Wiring between operator ports; the last connection has no to_op and targets
           the enclosing process port "result 1". -->
      <connect from_op="SPARQL Data Importer" from_port="Example Set" to_op="Join" to_port="left"/>
      <connect from_op="Retrieve examples2" from_port="output" to_op="Join" to_port="right"/>
      <connect from_op="Join" from_port="join" to_op="Replace Missing Values" to_port="example set input"/>
      <connect from_op="Replace Missing Values" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Rule Induction" to_port="training set"/>
      <connect from_op="Rule Induction" from_port="model" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

and sent to the server via a POST request to http://<server_host>/api/soap/RepositoryService with this payload:

<?xml version="1.0" ?>
<!-- SOAP request for the RepositoryService endpoint: the storeProcess operation
     carries a repository location and a RapidMiner process definition. -->
<S:Envelope xmlns:S="http://schemas.xmlsoap.org/soap/envelope/">
<S:Body>
    <ns2:storeProcess xmlns:ns2="http://service.web.rapidanalytics.de/">
        <!-- Repository location for the process; the executeProcessSimple request
             refers to the same path. -->
        <entryLocation>/home/admin/trial1</entryLocation>
        <!-- The process definition is transported as escaped text, not as child
             elements. Markup characters are therefore escaped once (&lt; &gt;), and
             references that are already part of the process XML are escaped a second
             time (for example &amp;#10; and &amp;lt; in the SPARQL query). Whitespace
             inside this element is part of the payload. -->
        <processXML>&lt;?xml version="1.0" encoding="UTF-8" standalone="no"?&gt;
&lt;process version="7.1.001"&gt;
&lt;context&gt;
&lt;input/&gt;
&lt;output/&gt;
&lt;macros/&gt;
&lt;/context&gt;
&lt;operator activated="true" class="process" compatibility="7.1.001" expanded="true" name="Process"&gt;
&lt;parameter key="logverbosity" value="init"/&gt;
&lt;parameter key="random_seed" value="2001"/&gt;
&lt;parameter key="send_mail" value="never"/&gt;
&lt;parameter key="notification_email" value=""/&gt;
&lt;parameter key="process_duration_for_mail" value="30"/&gt;
&lt;parameter key="encoding" value="SYSTEM"/&gt;
&lt;process expanded="true"&gt;
  &lt;operator activated="true" class="lod:SPARQL Data Importer" compatibility="1.5.012" expanded="true" height="68" name="SPARQL Data Importer" width="90" x="246" y="34"&gt;
    &lt;parameter key="SPARQL connection" value="DBpedia"/&gt;
    &lt;parameter key="SPARQL query" value="SELECT ?s ?p ?o&amp;#10;WHERE {&amp;#10;  ?s a &amp;lt;http://dbpedia.org/ontology/Person&amp;gt; .&amp;#10;  ?s ?p ?o&amp;#10;}"/&gt;
  &lt;/operator&gt;
  &lt;operator activated="true" class="retrieve" compatibility="7.1.001" expanded="true" height="68" name="Retrieve examples2" width="90" x="246" y="136"&gt;
    &lt;parameter key="repository_entry" value="data/examples2"/&gt;
  &lt;/operator&gt;
  &lt;operator activated="true" class="join" compatibility="7.1.001" expanded="true" height="82" name="Join" width="90" x="380" y="85"&gt;
    &lt;parameter key="remove_double_attributes" value="true"/&gt;
    &lt;parameter key="join_type" value="outer"/&gt;
    &lt;parameter key="use_id_attribute_as_key" value="false"/&gt;
    &lt;list key="key_attributes"&gt;
      &lt;parameter key="s" value="s"/&gt;
    &lt;/list&gt;
    &lt;parameter key="keep_both_join_attributes" value="false"/&gt;
  &lt;/operator&gt;
  &lt;operator activated="true" class="replace_missing_values" compatibility="7.1.001" expanded="true" height="103" name="Replace Missing Values" width="90" x="514" y="85"&gt;
    &lt;parameter key="return_preprocessing_model" value="false"/&gt;
    &lt;parameter key="create_view" value="true"/&gt;
    &lt;parameter key="attribute_filter_type" value="single"/&gt;
    &lt;parameter key="attribute" value="label"/&gt;
    &lt;parameter key="attributes" value="label"/&gt;
    &lt;parameter key="use_except_expression" value="false"/&gt;
    &lt;parameter key="value_type" value="attribute_value"/&gt;
    &lt;parameter key="use_value_type_exception" value="false"/&gt;
    &lt;parameter key="except_value_type" value="time"/&gt;
    &lt;parameter key="block_type" value="attribute_block"/&gt;
    &lt;parameter key="use_block_type_exception" value="false"/&gt;
    &lt;parameter key="except_block_type" value="value_matrix_row_start"/&gt;
    &lt;parameter key="invert_selection" value="false"/&gt;
    &lt;parameter key="include_special_attributes" value="true"/&gt;
    &lt;parameter key="default" value="value"/&gt;
    &lt;list key="columns"/&gt;
    &lt;parameter key="replenishment_value" value="0"/&gt;
  &lt;/operator&gt;
  &lt;operator activated="true" class="set_role" compatibility="7.1.001" expanded="true" height="82" name="Set Role" width="90" x="648" y="85"&gt;
    &lt;parameter key="attribute_name" value="label"/&gt;
    &lt;parameter key="target_role" value="label"/&gt;
    &lt;list key="set_additional_roles"/&gt;
  &lt;/operator&gt;
  &lt;operator activated="true" class="rule_induction" compatibility="7.1.001" expanded="true" height="82" name="Rule Induction" width="90" x="782" y="85"&gt;
    &lt;parameter key="criterion" value="information_gain"/&gt;
    &lt;parameter key="sample_ratio" value="0.9"/&gt;
    &lt;parameter key="pureness" value="0.9"/&gt;
    &lt;parameter key="minimal_prune_benefit" value="0.25"/&gt;
    &lt;parameter key="use_local_random_seed" value="false"/&gt;
    &lt;parameter key="local_random_seed" value="1992"/&gt;
  &lt;/operator&gt;
  &lt;connect from_op="SPARQL Data Importer" from_port="Example Set" to_op="Join" to_port="left"/&gt;
  &lt;connect from_op="Retrieve examples2" from_port="output" to_op="Join" to_port="right"/&gt;
  &lt;connect from_op="Join" from_port="join" to_op="Replace Missing Values" to_port="example set input"/&gt;
  &lt;connect from_op="Replace Missing Values" from_port="example set output" to_op="Set Role" to_port="example set input"/&gt;
  &lt;connect from_op="Set Role" from_port="example set output" to_op="Rule Induction" to_port="training set"/&gt;
  &lt;connect from_op="Rule Induction" from_port="model" to_port="result 1"/&gt;
  &lt;portSpacing port="source_input 1" spacing="0"/&gt;
  &lt;portSpacing port="sink_result 1" spacing="0"/&gt;
  &lt;portSpacing port="sink_result 2" spacing="0"/&gt;
&lt;/process&gt;
&lt;/operator&gt;
&lt;/process&gt;
        </processXML>
    </ns2:storeProcess>
</S:Body>
</S:Envelope>

Afterwards the overall learning scenario can be executed by sending

<?xml version="1.0" ?>
<!-- SOAP request for the ProcessService endpoint: asks the server to execute a
     process that is already stored in its repository. -->
<S:Envelope xmlns:S="http://schemas.xmlsoap.org/soap/envelope/">
  <S:Body>
    <ns2:executeProcessSimple xmlns:ns2="http://service.web.rapidanalytics.de/">
      <!-- Repository location of the process to run. -->
      <processLocation>/home/admin/trial1</processLocation>
      <!-- Left empty: no process context is supplied with this request. -->
      <processContext/>
    </ns2:executeProcessSimple>
  </S:Body>
</S:Envelope>

to http://<server_host>/api/soap/ProcessService, again as POST request.

For this approach to work, a so-called connection to a SPARQL endpoint (called 'DBpedia' in the example above) or to a local file would have to be created so that RDF triples can be read. In our case this means creating a file connection for each of the learning tasks' OWL files. This comes with two severe problems/restrictions:

Considering these problems we gave up trying to include RapidMiner.