"""Feature selection model with generative models.
Algorithm between SAM and CGNN
Author : Diviyan Kalainathan & Olivier Goudet
.. MIT License
..
.. Copyright (c) 2018 Diviyan Kalainathan
..
.. Permission is hereby granted, free of charge, to any person obtaining a copy
.. of this software and associated documentation files (the "Software"), to deal
.. in the Software without restriction, including without limitation the rights
.. to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
.. copies of the Software, and to permit persons to whom the Software is
.. furnished to do so, subject to the following conditions:
..
.. The above copyright notice and this permission notice shall be included in all
.. copies or substantial portions of the Software.
..
.. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
.. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
.. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
.. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
.. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
.. OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
.. SOFTWARE.
"""
import torch as th
import numpy as np
import networkx as nx
from torch.utils.data import Dataset, TensorDataset
from sklearn.preprocessing import scale
from tqdm import trange
from .model import FeatureSelectionModel
from ...utils.Settings import SETTINGS
from ...utils.loss import MMDloss
from ...utils.parallel import parallel_run_generator


class FSGNN_model(th.nn.Module):
"""Variant of CGNN for feature selection.
Args:
sizes (list): Size of the neural network layers
dropout (float): Dropout rate of the neural connections
activation_function (torch.nn.Module): Activation function of the network
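
    Example:
        A minimal sketch; the first size counts the input features plus
        one extra slot for the noise variable appended during training:

        >>> model = FSGNN_model([10 + 1, 20, 1])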
"""
def __init__(self, sizes, dropout=0., activation_function=th.nn.ReLU):
super(FSGNN_model, self).__init__()
layers = []
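        # Build Linear -> (Dropout) -> Activation blocks for the hidden
        # layers, then a final Linear output layer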
for i, j in zip(sizes[:-2], sizes[1:-1]):
layers.append(th.nn.Linear(i, j))
if dropout != 0.:
layers.append(th.nn.Dropout(p=dropout))
layers.append(activation_function())
layers.append(th.nn.Linear(sizes[-2], sizes[-1]))
self.layers = th.nn.Sequential(*layers)
self.sizes = sizes

    def forward(self, x):
        """Forward pass in the network.

        Args:
            x (torch.Tensor): input data

        Returns:
            torch.Tensor: output of the network
        """
        return self.layers(x)

    def train(self, dataset, lr=0.01, l1=0.1, batch_size=-1,
              train_epochs=1000, test_epochs=1000, device=None,
              verbose=None, dataloader_workers=0):
"""Train the network and output the scores of the features
Args:
dataset (torch.utils.data.Dataset): Original data
lr (float): Learning rate
l1 (float): Coefficient of the L1 regularization
batch_size (int): Batch size of the model, defaults to the dataset size.
train_epochs (int): Number of train epochs
test_epochs (int): Number of test epochs
device (str): Device on which the computation is to be run
verbose (bool): Verbosity of the model
dataloader_workers (int): Number of workers for dataset loading

        Returns:
list: feature selection scores for each feature.
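
        Example:
            A minimal sketch, assuming ``data`` is an ``(n, d)`` feature
            tensor and ``target`` an ``(n, 1)`` target tensor (both
            hypothetical names):

            >>> from torch.utils.data import TensorDataset
            >>> dataset = TensorDataset(data, target)
            >>> model = FSGNN_model([d + 1, 20, 1])
            >>> scores = model.train(dataset, train_epochs=100,
            ...                      test_epochs=100)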
"""
device, verbose = SETTINGS.get_default(('device', device), ('verbose', verbose))
optim = th.optim.Adam(self.parameters(), lr=lr)
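        # Accumulator for the per-feature scores; the last input column is
        # the noise variable, hence sizes[0] - 1 slots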
output = th.zeros(self.sizes[0] - 1).to(device)
        if batch_size == -1:
            batch_size = len(dataset)
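        # The MMD criterion compares the distribution of generated samples
        # with the distribution of the observed target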
criterion = MMDloss(input_size=batch_size).to(device)
        # Noise input of the generator, resampled at every step
noise = th.randn(batch_size, 1).to(device)
data_iterator = th.utils.data.DataLoader(dataset, batch_size=batch_size,
shuffle=True, drop_last=True,
num_workers=dataloader_workers)
        # Train for train_epochs, then keep optimizing for test_epochs
        # while accumulating feature scores
with trange(train_epochs + test_epochs, disable=not verbose) as t:
for epoch in t:
for i, (batch, target) in enumerate(data_iterator):
optim.zero_grad()
noise.normal_()
                    gen = self.layers(th.cat([batch, noise], 1))
                    # L1 penalty over all linear layers (fixed indices break
                    # when dropout inserts an extra module)
                    l1_reg = sum(l.weight.abs().sum() for l in self.layers
                                 if isinstance(l, th.nn.Linear))
                    loss = criterion(gen, target) + l1 * l1_reg
                    # Report the current loss every 100 epochs
                    if not epoch % 100 and i == 0:
                        t.set_postfix(epoch=epoch, loss=loss.item())
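                    # During the test epochs, accumulate the first-layer
                    # weights toward each feature (dropping the noise
                    # column) as the feature scores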
if epoch >= train_epochs:
output.add_(self.layers[0].weight.data[:, :-1].sum(dim=0))
loss.backward()
optim.step()
        # Average the accumulated scores over test epochs and batches
        return list(output.div_(test_epochs)
                    .div_(len(dataset) // batch_size)
                    .cpu().numpy())


class FSGNN(FeatureSelectionModel):
    """Feature Selection using MMD and Generative Neural Networks.

    Args:
nh (int): number of hidden units
dropout (float): probability of dropout (between 0 and 1)
activation_function (torch.nn.Module): activation function of the NN
lr (float): learning rate of Adam
l1 (float): L1 penalization coefficient
batch_size (int): batch size, defaults to full-batch
train_epochs (int): number of train epochs
test_epochs (int): number of test epochs
verbose (bool): verbosity (defaults to ``cdt.SETTINGS.verbose``)
nruns (int): number of bootstrap runs
dataloader_workers (int): how many subprocesses to use for data
loading. 0 means that the data will be loaded in the main
            process. (default: 0)
        njobs (int): number of jobs to run in parallel (defaults to
            ``cdt.SETTINGS.NJOBS``)

    Example:
        (``load_boston`` was removed in scikit-learn 1.2; this example
        assumes an earlier version.)

        >>> import pandas as pd
        >>> from cdt.independence.graph import FSGNN
        >>> from sklearn.datasets import load_boston
        >>> boston = load_boston()
        >>> df_features = pd.DataFrame(boston['data'])
        >>> df_target = pd.DataFrame(boston['target'])
        >>> obj = FSGNN()
        >>> output = obj.predict_features(df_features, df_target)
        >>> ugraph = obj.predict(df_features)  # Predict skeleton
"""

    def __init__(self, nh=20, dropout=0., activation_function=th.nn.ReLU,
lr=0.01, l1=0.1, batch_size=-1, train_epochs=1000,
test_epochs=1000, verbose=None, nruns=3,
dataloader_workers=0, njobs=None):
"""Init the model."""
super(FSGNN, self).__init__()
self.nh = nh
self.dropout = dropout
self.activation_function = activation_function
self.lr = lr
self.l1 = l1
self.batch_size = batch_size
self.train_epochs = train_epochs
self.test_epochs = test_epochs
self.verbose = SETTINGS.get_default(verbose=verbose)
self.nruns = nruns
self.njobs = SETTINGS.get_default(njobs=njobs)
self.dataloader_workers = dataloader_workers

    def predict_features(self, df_features, df_target,
                         datasetclass=TensorDataset, device=None, idx=0):
        """For one variable, predict its neighbours.

Args:
df_features (pandas.DataFrame): Features to select
df_target (pandas.Series): Target variable to predict
datasetclass (torch.utils.data.Dataset): Class to override for
custom loading of data.
idx (int): (optional) for printing purposes
device (str): cuda or cpu device (defaults to
``cdt.SETTINGS.default_device``)

        Returns:
            list: scores of each feature relative to the target
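
        Example:
            A sketch with a hypothetical ``MyDataset`` class (subclassing
            ``torch.utils.data.Dataset`` and accepting the feature and
            target tensors) to override data loading:

            >>> scores = obj.predict_features(df_features, df_target,
            ...                               datasetclass=MyDataset)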
"""
device = SETTINGS.get_default(device=device)
dataset = datasetclass(th.Tensor(scale(df_features.values)).to(device),
th.Tensor(scale(df_target.values)).to(device))
out = []
for i in range(self.nruns):
model = FSGNN_model([df_features.shape[1] + 1, self.nh, 1],
activation_function=self.activation_function,
dropout=self.dropout).to(device)
            out.append(model.train(dataset, lr=self.lr, l1=self.l1,
batch_size=self.batch_size,
train_epochs=self.train_epochs,
test_epochs=self.test_epochs,
device=device, verbose=self.verbose,
dataloader_workers=self.dataloader_workers))
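        # Average the scores over the nruns independent runs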
return list(np.mean(np.array(out), axis=0))

    def predict(self, df_data, threshold=0.05, gpus=None, **kwargs):
        """Predict the skeleton of the graph from raw data.

        Runs the feature selection algorithm on each node iteratively.

        Args:
            df_data (pandas.DataFrame): data to construct a graph from
            threshold (float): cutoff value for feature selection scores
            gpus (int): number of gpus to use (defaults to
                ``cdt.SETTINGS.GPU``)
            kwargs (dict): additional arguments for algorithms

        Returns:
            networkx.Graph: predicted skeleton of the graph.
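
        Example:
            A sketch reusing ``df_features`` from the class example, with
            a stricter cutoff:

            >>> skeleton = obj.predict(df_features, threshold=0.1)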
"""
njobs = self.njobs
gpus = SETTINGS.get_default(gpu=gpus)
list_nodes = list(df_data.columns.values)
if gpus > 0:
result_feature_selection = parallel_run_generator(self.run_feature_selection,
[([df_data, node], kwargs)
for node in list_nodes],
gpus=gpus, njobs=njobs)
else:
result_feature_selection = [self.run_feature_selection(df_data, node,
idx, **kwargs)
for idx, node in enumerate(list_nodes)]
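        # Each run scores all *other* nodes: insert a 0 at the node's own
        # index so every score vector has one entry per node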
for idx, i in enumerate(result_feature_selection):
try:
i.insert(idx, 0)
except AttributeError: # if results are numpy arrays
result_feature_selection[idx] = np.insert(i, idx, 0)
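        # Symmetrize by taking the elementwise product with the transpose,
        # so an edge survives only if both directions score it; then zero
        # the diagonal and halve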
matrix_results = np.array(result_feature_selection)
matrix_results *= matrix_results.transpose()
np.fill_diagonal(matrix_results, 0)
matrix_results /= 2
graph = nx.Graph()
        for (i, j), score in np.ndenumerate(matrix_results):
            if score > threshold:
                graph.add_edge(list_nodes[i], list_nodes[j],
                               weight=score)
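        # Make sure isolated variables still appear as nodes of the skeleton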
for node in list_nodes:
if node not in graph.nodes():
graph.add_node(node)
return graph