JannisRehbock / TopicExplorer

TopicExplorer
GNU Affero General Public License v3.0
0 stars 0 forks source link

Einlesen von SQL Dateien #4

Open JannisRehbock opened 6 years ago

JannisRehbock commented 6 years ago

Ideen zum Einlesen von SQL-Dateien in Spark.

hinneburg commented 6 years ago
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.text.StrSubstitutor;
import org.apache.commons.text.StringSubstitutor;

private static String readFileFromClasspath(String fileName) throws IOException {
        InputStream fis = RandomProjection.class.getResourceAsStream("/" + fileName);
        return IOUtils.toString(fis, "UTF-8");
    }

private static Dataset<Row> createDataSetFromSqlFile(String viewName, SparkSession spark)
            throws IOException, AnalysisException {
        Dataset<Row> result = spark.sql(readFileFromClasspath(viewName + ".sql"));
        return result;
    }

private static Dataset<Row> createTempViewFromSqlFile(String viewName, SparkSession spark)
            throws IOException, AnalysisException {
        Dataset<Row> result = createDataSetFromSqlFile(viewName, spark);
        result.createTempView(viewName);
        return result;
    }

private static Dataset<Row> createDataSetFromSqlFile(String viewName, HashMap<String, String> parameters,
            SparkSession spark) throws IOException, AnalysisException {
        String template = readFileFromClasspath(viewName + ".sql");
        Dataset<Row> result = spark.sql(StringSubstitutor.replace(template, parameters));
        return result;
    }

....
        HashMap<String, String> queryParameters = new HashMap<String, String>();
            queryParameters.put("resultPosStringList", resultPosList);
            queryParameters.put("queryWordString", query.queryWord);
            queryParameters.put("queryPosString", query.queryPos);
            System.out.println(queryParameters.toString());
            Dataset<Row> wordSimilarity = createDataSetFromSqlFile("word_similarity", queryParameters, spark);

Sql File word_similarity.sql

SELECT
  /*+ BROADCAST(cvl_a), BROADCAST(cvl_b) */
  lemma_a
  , pos_a
  , lemma_b
  , pos_b
  , scalar_product / ( sqrt(cvl_a.length_square) * sqrt(cvl_b.length_square) ) as sim  
from (
     SELECT
       /*+ BROADCAST(word_a), BROADCAST(word_b) */
       word_a.lemma as lemma_a
       , word_a.pos as pos_a
       , word_b.lemma as lemma_b
       , word_b.pos as pos_b
       , sum(word_a.value*word_b.value) as scalar_product 
     from 
       context_vector4 word_a 
       join context_vector4 word_b 
       on (word_a.dimension=word_b.dimension) 
     where 
       word_a.lemma= ${queryWordString:-'year'} 
       and word_a.pos= ${queryPosString:-'NN'}
       AND word_b.pos IN ( ${resultPosStringList:-'NN'} ) 
     group by 
       word_a.lemma
       , word_a.pos
       , word_b.lemma
       , word_b.pos 
    ) sim
    join context_vector_length_square cvl_a on (lemma_a=cvl_a.lemma and pos_a=cvl_a.pos) 
    join context_vector_length_square cvl_b on (lemma_b=cvl_b.lemma and pos_b=cvl_b.pos)
order by sim desc