Error when running PC alg (some CSV file)

Hi, as you know, I have installed most of the packages and attempted to run the PC algorithm on sachs as well as fsgnn. I thought I was out of trouble as the R setup looked fine, but when calling the following python3.6 snippet:

from cdt.causality.graph.PC import PC

pc = PC(CItest="hsic",method_indep="rcit")
pcgraph = pc.predict(data)

The interpreter spat out the following error message:

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

FileNotFoundError: [Errno 2] File b'/tmp/cdt_pc_0f4d2452-ff3c-4a84-81dd-73ab5b4d474b//result.csv' does not exist: b'/tmp/cdt_pc_0f4d2452-ff3c-4a84-81dd-73ab5b4d474b//result.csv'

Any idea of what this is about?

Regards, A.V

NB: here is the full error printout

--------------------------------------------------------------------
FileNotFoundError                  Traceback (most recent call last)
<ipython-input-50-db1fca6dbddc> in <module>
----> 1 pcgraph = pc.predict(data)

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/cdt/causality/graph/model.py in predict(self, df_data, graph, **kwargs)
     61         """
     62         if graph is None:
---> 63             return self.create_graph_from_data(df_data, **kwargs)
     64         elif isinstance(graph, nx.DiGraph):
     65             return self.orient_directed_graph(df_data, graph, **kwargs)

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/cdt/causality/graph/PC.py in create_graph_from_data(self, data, **kwargs)
    257         self.arguments['{VERBOSE}'] = str(self.verbose).upper()
    258 
--> 259         results = self._run_pc(data, verbose=self.verbose)
    260 
    261         return nx.relabel_nodes(nx.DiGraph(results),

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/cdt/causality/graph/PC.py in _run_pc(self, data, fixedEdges, fixedGaps, verbose)
    300         except Exception as e:
    301             rmtree(run_dir)
--> 302             raise e
    303         except KeyboardInterrupt:
    304             rmtree(run_dir)

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/cdt/causality/graph/PC.py in _run_pc(self, data, fixedEdges, fixedGaps, verbose)
    296 
    297             pc_result = launch_R_script("{}/R_templates/pc.R".format(os.path.dirname(os.path.realpath(__file__))),
--> 298                                         self.arguments, output_function=retrieve_result, verbose=verbose)
    299         # Cleanup
    300         except Exception as e:

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/cdt/utils/R.py in launch_R_script(template, arguments, output_function, verbose, debug)
    198         if not debug:
    199             rmtree(base_dir)
--> 200         raise e
    201     except KeyboardInterrupt:
    202         if not debug:

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/cdt/utils/R.py in launch_R_script(template, arguments, output_function, verbose, debug)
    192                                            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    193             process.wait()
--> 194             output = output_function()
    195 
    196     # Cleaning up

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/cdt/causality/graph/PC.py in retrieve_result()
    284 
    285         def retrieve_result():
--> 286             return read_csv('{}/result.csv'.format(run_dir), delimiter=',').values
    287 
    288         try:

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    683         )
    684 
--> 685         return _read(filepath_or_buffer, kwds)
    686 
    687     parser_f.__name__ = name

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    455 
    456     # Create the parser.
--> 457     parser = TextFileReader(fp_or_buf, **kwds)
    458 
    459     if chunksize or iterator:

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    893             self.options["has_index_names"] = kwds["has_index_names"]
    894 
--> 895         self._make_engine(self.engine)
    896 
    897     def close(self):

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1133     def _make_engine(self, engine="c"):
   1134         if engine == "c":
-> 1135             self._engine = CParserWrapper(self.f, **self.options)
   1136         else:
   1137             if engine == "python":

~/progtools/python/virtualenvs/tfcuda/lib/python3.6/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1915         kwds["usecols"] = self.usecols
   1916 
-> 1917         self._reader = parsers.TextReader(src, **kwds)
   1918         self.unnamed_cols = self._reader.unnamed_cols
   1919 

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

FileNotFoundError: [Errno 2] File b'/tmp/cdt_pc_0f4d2452-ff3c-4a84-81dd-73ab5b4d474b//result.csv' does not exist: b'/tmp/cdt_pc_0f4d2452-ff3c-4a84-81dd-73ab5b4d474b//result.csv'

If my understanding is correct, either pc.R does not write results (because it fails to do so, or is simply not called), or PC.py does not know how/where to get the results back.

More detailed:

Checking that the R packages can be loaded:
```
dfpkg = DefaultRPackages()
dfpkg.check_R_package('some_package')
```
Returns True for all the packages needed.
The scripts seem to be writing to funny paths such as /tmp/cdt_pc_ea47a7a6-85c3-4624-a8a8-144b68ae72b0//data.csv. Wouldn't the double-slash (<something>//<something>) mess things up?
The problem seems to come from the launch_R_script function from R.py

Update: problem comes from the R script pc.R. The following pops on the terminal when the process is called:

Loading required package: momentchi2
Loading required package: MASS
Casting arguments...
Error in skeleton(suffStat, indepTest, alpha, labels = labels, method = skel.method,  : 
  Evaluation error: object 'pval' not found.
Calls: runPC -> <Anonymous> -> skeleton
Execution halted

Investigating further...

Strangely enough, the CRAN pcalg doc doesn't list any pval variable for the skeleton function. Edit: this is what I found through R's getAnywhere(): the relevant bit of the skeleton function is:

                  if (length_nbrs >= ord) {
                    if (length_nbrs > ord) 
                      done <- FALSE
                    S <- seq_len(ord)
                    repeat {
                      n.edgetests[ord1] <- n.edgetests[ord1] + 
                        1
                      pval <- indepTest(x, y, nbrs[S], suffStat)
                      if (verbose) 
                        cat("x=", x, " y=", y, " S=", nbrs[S], 
                          ": pval =", pval, "\n")
                      if (is.na(pval)) 
                        pval <- as.numeric(NAdelete)
                      if (pMax[x, y] < pval) 
                        pMax[x, y] <- pval
                      if (pval >= alpha) {
                        G[x, y] <- G[y, x] <- FALSE
                        sepset[[x]][[y]] <- nbrs[S]
                        break
                      }
                      else {
                        nextSet <- getNextSet(length_nbrs, ord, 
                          S)
                        if (nextSet$wasLast) 
                          break
                        S <- nextSet$nextSet
                      }

What do you think? Is it some problem related to the indepTest itself, or to the way it is passed to R?

Ok, went to check the pc.R file, and something bugs me. First, you define the PC options this way

# additional options for PC
  optionsList <- list("indepTest"={CITEST}, "fixedEdges"=fixedEdges,
                      "NAdelete"=TRUE, "m.max"=Inf, "u2pd" = "relaxed",
                      "skel.method"= "stable.fast", "conservative"=FALSE,
                      "maj.rule"=TRUE, "solve.confl"=FALSE, numCores={NJOBS})

which becomes (after your template editing) in my case

  # additional options for PC
  optionsList <- list("indepTest"=kpcalg::kernelCItest, "fixedEdges"=fixedEdges,
                      "NAdelete"=TRUE, "m.max"=Inf, "u2pd" = "relaxed",
                      "skel.method"= "stable.fast", "conservative"=FALSE,
                      "maj.rule"=TRUE, "solve.confl"=FALSE, numCores=1)

But then, both in the template pc.R and after-editing, I find that the last function call

result <- runPC(dataset, suffStat = NULL, parentsOf, alpha,
               variableSelMat, setOptions,
               directed, verbose, fixedEdges, fixedGaps, CI_test)

is made using some variable CI_test which I cannot find in any of the two files?

Would that be the problem? (As you know, pval, the output of indepTest, is the root of my issue.)

Any clue would help me greatly!

Edit: So far, the variable CI_test takes the place of result in the function call, which is immediately set to an empty list through

result <- vector("list", length = length(parentsOf))

I do not see the point of this. Why give result as a function argument, and why give CI_test such a suggestive name? Much confusion.

Hi, thanks for the detailed bug report, I'll look into this. The strange point is that the continuous integration tool did not report any bug...

Best, Diviyan

Hi All,

I'll just note that I encounter this error for all R-dependent cdt.causality.graph methods. The problem persists with custom data.

Running the basic tutorial code verbatim:

PC is ran on the skeleton of the given graph.
---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
 in 
----> 1 model.predict(data, new_skeleton)

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/cdt/causality/graph/model.py in predict(self, df_data, graph, **kwargs)
     63             return self.create_graph_from_data(df_data, **kwargs)
     64         elif isinstance(graph, nx.DiGraph):
---> 65             return self.orient_directed_graph(df_data, graph, **kwargs)
     66         elif isinstance(graph, nx.Graph):
     67             return self.orient_undirected_graph(df_data, graph, **kwargs)

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/cdt/causality/graph/PC.py in orient_directed_graph(self, data, graph, *args, **kwargs)
    240         """
    241         warnings.warn("PC is ran on the skeleton of the given graph.")
--> 242         return self.orient_undirected_graph(data, nx.Graph(graph), *args, **kwargs)
    243 
    244     def create_graph_from_data(self, data, **kwargs):

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/cdt/causality/graph/PC.py in orient_undirected_graph(self, data, graph, **kwargs)
    219         fg = DataFrame(1 - fe.values)
    220 
--> 221         results = self._run_pc(data, fixedEdges=fe, fixedGaps=fg, verbose=self.verbose)
    222 
    223         return nx.relabel_nodes(nx.DiGraph(results),

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/cdt/causality/graph/PC.py in _run_pc(self, data, fixedEdges, fixedGaps, verbose)
    302         except Exception as e:
    303             rmtree(run_dir)
--> 304             raise e
    305         except KeyboardInterrupt:
    306             rmtree(run_dir)

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/cdt/causality/graph/PC.py in _run_pc(self, data, fixedEdges, fixedGaps, verbose)
    298 
    299             pc_result = launch_R_script("{}/R_templates/pc.R".format(os.path.dirname(os.path.realpath(__file__))),
--> 300                                         self.arguments, output_function=retrieve_result, verbose=verbose)
    301         # Cleanup
    302         except Exception as e:

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/cdt/utils/R.py in launch_R_script(template, arguments, output_function, verbose, debug)
    198         if not debug:
    199             rmtree(base_dir)
--> 200         raise e
    201     except KeyboardInterrupt:
    202         if not debug:

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/cdt/utils/R.py in launch_R_script(template, arguments, output_function, verbose, debug)
    192                                            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    193             process.wait()
--> 194             output = output_function()
    195 
    196     # Cleaning up

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/cdt/causality/graph/PC.py in retrieve_result()
    286 
    287         def retrieve_result():
--> 288             return read_csv('{}/result.csv'.format(run_dir), delimiter=',').values
    289 
    290         try:

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/pandas/io/parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    683         )
    684 
--> 685         return _read(filepath_or_buffer, kwds)
    686 
    687     parser_f.__name__ = name

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/pandas/io/parsers.py in _read(filepath_or_buffer, kwds)
    455 
    456     # Create the parser.
--> 457     parser = TextFileReader(fp_or_buf, **kwds)
    458 
    459     if chunksize or iterator:

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, f, engine, **kwds)
    893             self.options["has_index_names"] = kwds["has_index_names"]
    894 
--> 895         self._make_engine(self.engine)
    896 
    897     def close(self):

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/pandas/io/parsers.py in _make_engine(self, engine)
   1133     def _make_engine(self, engine="c"):
   1134         if engine == "c":
-> 1135             self._engine = CParserWrapper(self.f, **self.options)
   1136         else:
   1137             if engine == "python":

~/.pyenv/versions/3.7.3/envs/cause_env/lib/python3.7/site-packages/pandas/io/parsers.py in __init__(self, src, **kwds)
   1915         kwds["usecols"] = self.usecols
   1916 
-> 1917         self._reader = parsers.TextReader(src, **kwds)
   1918         self.unnamed_cols = self._reader.unnamed_cols
   1919 

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()

FileNotFoundError: [Errno 2] File b'/var/folders/r0/l_20fjfn6vs6l848f3jfysj00000gn/T/cdt_pc_b91480ab-9011-49ca-8f51-0f3aa9c5d861//result.csv' does not exist: b'/var/folders/r0/l_20fjfn6vs6l848f3jfysj00000gn/T/cdt_pc_b91480ab-9011-49ca-8f51-0f3aa9c5d861//result.csv'

Sorry for the delay. To solve @ArnoVel 's issue, it originates from :

pc = PC(CItest="hsic",method_indep="rcit")

These two options are incompatible. To use RCIT, you should use CItest='randomized'. I should change the API to make it simpler to avoid these issues.

@nedarb708, I think your issue is different; could you explain your setup? Please provide the lines that error. This FileNotFoundError originates from the R process failing. Are you running it in a Docker container?

@Diviyan-Kalainathan Thanks for this, it makes sense! I do think, however, that my intuition of first picking the type of test, and then specifying that I wanted to use the randomized version to approximate the HSIC test, makes sense in terms of design? Anyway, thank you for the explanation :)

It should be much easier to understand, and errors might be avoided in the future :)

FenTechSolutions / CausalDiscoveryToolbox

Error when running PC alg (some CSV file) #50