Source code for cdt.causality.graph.CAM

"""CAM algorithm.

Imported from the CAM R package.
Author: Diviyan Kalainathan

.. MIT License
..
.. Copyright (c) 2018 Diviyan Kalainathan
..
.. Permission is hereby granted, free of charge, to any person obtaining a copy
.. of this software and associated documentation files (the "Software"), to deal
.. in the Software without restriction, including without limitation the rights
.. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
.. copies of the Software, and to permit persons to whom the Software is
.. furnished to do so, subject to the following conditions:
..
.. The above copyright notice and this permission notice shall be included in all
.. copies or substantial portions of the Software.
..
.. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
.. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
.. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
.. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
.. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
.. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
.. SOFTWARE.
"""
import os
import uuid
import warnings
import platform
import networkx as nx
from pathlib import Path
from shutil import rmtree
from pandas import read_csv
from tempfile import gettempdir
from .model import GraphModel
from ...utils.Settings import SETTINGS
from ...utils.R import RPackages, launch_R_script


def message_warning(msg, *a, **kwargs):
    """Render a warning as its bare message text followed by a newline.

    The signature accepts (and discards) the extra positional and keyword
    arguments that a ``warnings.formatwarning`` replacement receives, so only
    ``msg`` contributes to the output.
    """
    return "".join((str(msg), "\n"))


# Module-level side effect: importing this file swaps the formatter on the
# shared ``warnings`` module, so warnings formatted through it are rendered
# as the message text only (see ``message_warning``).
warnings.formatwarning = message_warning


class CAM(GraphModel):
    r"""CAM algorithm **[R model]**.

    **Description:** Causal Additive models, a causal discovery algorithm
    relying on fitting Gaussian Processes on data, while considering all noises
    additives and additive contributions of variables.

    **Required R packages**: CAM

    **Data Type:** Continuous

    **Assumptions:** The data follows a generalized additive noise model:
    each variable :math:`X_i` in the graph :math:`\mathcal{G}` is generated
    following the model
    :math:`X_i = \sum_{X_j \in \mathcal{G}} f(X_j) + \epsilon_i`,
    :math:`\epsilon_i` representing mutually independent noises variables
    accounting for unobserved variables.

    Args:
        score (str): Score used to fit the gaussian processes
            (default: ``'nonlinear'``).
        cutoff (float): threshold value for variable selection
            (default: ``0.001``).
        variablesel (bool): Perform a variable selection step
            (default: ``True``).
        selmethod (str): Method used for variable selection
            (default: ``'gamboost'``).
        pruning (bool): Perform an initial pruning step
            (default: ``False``).
        prunmethod (str): Method used for pruning (default: ``'gam'``).
        njobs (int): Number of jobs to run in parallel. ``None`` falls back
            to ``SETTINGS.get_default``.
        verbose (bool): Sets the verbosity of the output. ``None`` falls back
            to ``SETTINGS.get_default``.

    Available scores:
        + nonlinear: 'SEMGAM'
        + linear: 'SEMLIN'

    Available variable selection methods (also used for ``prunmethod``):
        + gamboost: 'selGamBoost'
        + gam: 'selGam'
        + lasso: 'selLasso'
        + linear: 'selLm'
        + linearboost: 'selLmBoost'

    Default Parameters:
        Initial values of the R template placeholders. All of them except
        FILE and OUTPUT are overwritten from the constructor arguments (and a
        fresh temporary FOLDER) each time the algorithm is run.

        + FOLDER: '/tmp/cdt_CAM/'
        + FILE: os.sep + 'data.csv'
        + SCORE: 'SEMGAM'
        + VARSEL: 'TRUE'
        + SELMETHOD: 'selGamBoost'
        + PRUNING: 'TRUE'
        + PRUNMETHOD: 'selGam'
        + NJOBS: str(SETTINGS.NJOBS)
        + CUTOFF: str(0.001)
        + VERBOSE: 'FALSE'
        + OUTPUT: os.sep + 'result.csv'

    .. note::
       Ref: Bühlmann, P., Peters, J., & Ernest, J. (2014). CAM: Causal
       additive models, high-dimensional order search and penalized
       regression. The Annals of Statistics, 42(6), 2526-2556.

    .. warning::
       This implementation of CAM does not support starting with a graph.
       The adaptation will be made at a later date.

    Example:
        >>> import networkx as nx
        >>> from cdt.causality.graph import CAM
        >>> from cdt.data import load_dataset
        >>> data, graph = load_dataset("sachs")
        >>> obj = CAM()
        >>> output = obj.predict(data)
    """

    def __init__(self, score='nonlinear', cutoff=0.001, variablesel=True,
                 selmethod='gamboost', pruning=False, prunmethod='gam',
                 njobs=None, verbose=None):
        """Init the model and its available arguments.

        Raises:
            ImportError: if the R package CAM is reported as unavailable by
                ``RPackages``.
        """
        if not RPackages.CAM:
            raise ImportError("R Package CAM is not available.")
        super().__init__()

        # User-facing option names -> identifiers substituted in the R script.
        self.scores = {'nonlinear': 'SEMGAM',
                       'linear': 'SEMLIN'}
        self.var_selection = {'gamboost': 'selGamBoost',
                              'gam': 'selGam',
                              'lasso': 'selLasso',
                              'linear': 'selLm',
                              'linearboost': 'selLmBoost'}
        # Placeholder -> value mapping handed to ``launch_R_script``.
        self.arguments = {'{FOLDER}': '/tmp/cdt_CAM/',
                          '{FILE}': os.sep + 'data.csv',
                          '{SCORE}': 'SEMGAM',
                          '{VARSEL}': 'TRUE',
                          '{SELMETHOD}': 'selGamBoost',
                          '{PRUNING}': 'TRUE',
                          '{PRUNMETHOD}': 'selGam',
                          '{NJOBS}': str(SETTINGS.NJOBS),
                          '{CUTOFF}': str(0.001),
                          '{VERBOSE}': 'FALSE',
                          '{OUTPUT}': os.sep + 'result.csv'}

        self.score = score
        self.cutoff = cutoff
        self.variablesel = variablesel
        self.selmethod = selmethod
        self.pruning = pruning
        self.prunmethod = prunmethod
        self.njobs = SETTINGS.get_default(njobs=njobs)
        self.verbose = SETTINGS.get_default(verbose=verbose)

    def orient_undirected_graph(self, data, graph, score='obs',
                                verbose=False, **kwargs):
        """Run CAM on an undirected graph.

        Not supported: all arguments are ignored.

        Raises:
            ValueError: always.
        """
        raise ValueError("CAM cannot (yet) be ran with a skeleton/directed graph.")

    def orient_directed_graph(self, data, graph, *args, **kwargs):
        """Run CAM on a directed_graph.

        Not supported: all arguments are ignored.

        Raises:
            ValueError: always.
        """
        raise ValueError("CAM cannot (yet) be ran with a skeleton/directed graph.")

    def create_graph_from_data(self, data, **kwargs):
        """Apply causal discovery on observational data using CAM.

        The current instance attributes (``score``, ``cutoff``, ...) are
        translated into the R template arguments right before the run, so
        they can be changed between calls. Extra keyword arguments are
        accepted but ignored.

        Args:
            data (pandas.DataFrame): DataFrame containing the data

        Returns:
            networkx.DiGraph: Solution given by the CAM algorithm, with nodes
            relabelled from positional indices to the column names of
            ``data``.

        Raises:
            KeyError: if ``score``, ``selmethod`` or ``prunmethod`` is not one
                of the available option names.
        """
        # Building setup w/ arguments.
        self.arguments.update({
            '{SCORE}': self.scores[self.score],
            '{CUTOFF}': str(self.cutoff),
            '{VARSEL}': str(self.variablesel).upper(),
            '{SELMETHOD}': self.var_selection[self.selmethod],
            '{PRUNING}': str(self.pruning).upper(),
            '{PRUNMETHOD}': self.var_selection[self.prunmethod],
            '{NJOBS}': str(self.njobs),
            '{VERBOSE}': str(self.verbose).upper(),
        })
        results = self._run_cam(data, verbose=self.verbose)

        # ``results`` is the array read back from the R output; presumably an
        # adjacency matrix indexed in column order -- confirm against cam.R.
        return nx.relabel_nodes(nx.DiGraph(results),
                                dict(enumerate(data.columns)))

    def _run_cam(self, data, fixedGaps=None, verbose=False):
        """Setting up and running CAM with all arguments.

        Writes ``data`` to a freshly created temporary directory, launches
        the ``cam.R`` template on it and reads the result back. The temporary
        directory is removed whether the run succeeds, fails or is
        interrupted.

        Args:
            data (pandas.DataFrame): data written (without header or index)
                as the input of the R script.
            fixedGaps: unused; kept for signature compatibility.
            verbose (bool): forwarded to ``launch_R_script``.

        Returns:
            The value returned by ``launch_R_script``; the output function
            given to it reads ``result.csv`` into an array.
        """
        template = Path(os.path.dirname(os.path.realpath(__file__))) / "R_templates" / "cam.R"

        # Run CAM
        if platform.system() == "Windows":
            # Forced to a single job on Windows (presumably the R-side
            # parallel backend is not usable there -- confirm).
            self.arguments['{NJOBS}'] = str(1)

        # One unique working directory per run so concurrent runs don't clash.
        run_dir = Path(gettempdir()) / 'cdt_cam_{0!s}'.format(uuid.uuid4())
        self.arguments['{FOLDER}'] = run_dir
        os.makedirs(run_dir, exist_ok=True)

        def retrieve_result():
            return read_csv(run_dir / 'result.csv', delimiter=',').values

        try:
            data.to_csv(run_dir / 'data.csv', header=False, index=False)
            return launch_R_script(template, self.arguments,
                                   output_function=retrieve_result,
                                   verbose=verbose)
        finally:
            # Cleanup on every exit path (success, Exception,
            # KeyboardInterrupt, ...) while letting the original exception
            # propagate with its traceback intact.
            rmtree(run_dir)