Introduction#

Simulate from a Gaussian copula and use mixed-normal distributions as margins

In [1]: import numpy as np

In [2]: from scipy.stats import multivariate_normal, norm, invwishart

In [3]: from scipy.linalg import toeplitz

In [4]: from scipy.optimize import root_scalar

In [5]: class GaussianMixture:
   ...:     def __init__(self, weights, mus, sigmas):
   ...:         self.weights = weights
   ...:         self.mus = mus
   ...:         self.sigmas = sigmas
   ...:         self.n_components = len(weights)
   ...:     def cdf(self, x):
   ...:         res = np.zeros_like(x)
   ...:         for i_component in range(self.n_components):
   ...:             res += self.weights[i_component] * norm.cdf(x,
   ...:                                                         loc=self.mus[i_component],
   ...:                                                         scale=self.sigmas[i_component])
   ...:         return res
   ...:     def ppf(self, x):
   ...:         bracket = [-10., 10.]
   ...:         if x.min() < self.cdf(bracket[0:1]):
   ...:             bracket[0] -= 50.
   ...:         if x.max() > self.cdf(bracket[1:2]):
   ...:             bracket[1] += 50.
   ...:         res = np.array([root_scalar(lambda yy: self.cdf(np.array([[yy]]).T) - x[i],
   ...:                                     bracket=bracket,
   ...:                                     method='brentq',
   ...:                                     xtol=1e-12, rtol=1e-12).root for i in range(len(x))])
   ...:         return res
   ...: 
In [6]: n_vars = 5

In [7]: margins = [None]*n_vars

In [8]: for i_var in range(n_vars):
   ...:     n_components = 3
   ...:     weights = np.full(n_components, 1./n_components)
   ...:     mus = norm.rvs(size=n_components, scale=2.)
   ...:     sigmas = np.sqrt(invwishart.rvs(df=3, scale=1.0, size=n_components))
   ...:     margins[i_var] = GaussianMixture(weights, mus, sigmas)
   ...: 
In [9]: cov_mat = toeplitz([np.power(0.5, k) for k in range(5)])

In [10]: u_train = norm.cdf(multivariate_normal(mean=np.zeros(5), cov=cov_mat).rvs(1000))

In [11]: x_train = np.full_like(u_train, np.nan)

In [12]: for i_var in range(n_vars):
   ....:     x_train[:, i_var] = margins[i_var].ppf(u_train[:, i_var])
   ....: 
In [13]: u_test = norm.cdf(multivariate_normal(mean=np.zeros(5), cov=cov_mat).rvs(1000))

In [14]: x_test = np.full_like(u_test, np.nan)

In [15]: for i_var in range(n_vars):
   ....:     x_test[:, i_var] = margins[i_var].ppf(u_test[:, i_var])
   ....: 

Gaussian knockoffs#

Estimate a Gaussian knockoff model

In [16]: from vineknockoffs.vine_knockoffs import VineKnockoffs

In [17]: gau_ko = VineKnockoffs()

In [18]: gau_ko.fit_gaussian_knockoffs(x_train=x_train)
Out[18]: <vineknockoffs.vine_knockoffs.VineKnockoffs at 0x7f13502b9a30>

In [19]: gau_ko._dvine.copulas
Out[19]: 
[[GaussianCopula(par=[0.4489001]),
  GaussianCopula(par=[0.48167758]),
  GaussianCopula(par=[0.50485079]),
  GaussianCopula(par=[0.51236845]),
  GaussianCopula(par=[0.08826382]),
  GaussianCopula(par=[0.4489001]),
  GaussianCopula(par=[0.48167758]),
  GaussianCopula(par=[0.50485079]),
  GaussianCopula(par=[0.51236845])],
 [GaussianCopula(par=[0.02854314]),
  GaussianCopula(par=[-0.00969086]),
  GaussianCopula(par=[0.01166561]),
  GaussianCopula(par=[0.14039401]),
  GaussianCopula(par=[0.07991839]),
  GaussianCopula(par=[0.02854314]),
  GaussianCopula(par=[-0.00969086]),
  GaussianCopula(par=[0.01166561])],
 [GaussianCopula(par=[0.06233617]),
  GaussianCopula(par=[-0.01899161]),
  GaussianCopula(par=[0.18216914]),
  GaussianCopula(par=[0.16629839]),
  GaussianCopula(par=[0.24476181]),
  GaussianCopula(par=[0.06233617]),
  GaussianCopula(par=[-0.01899161])],
 [GaussianCopula(par=[0.01043043]),
  GaussianCopula(par=[0.39365313]),
  GaussianCopula(par=[0.39891868]),
  GaussianCopula(par=[0.40050723]),
  GaussianCopula(par=[0.45250961]),
  GaussianCopula(par=[0.01043043])],
 [GaussianCopula(par=[-0.25828746]),
  GaussianCopula(par=[-0.22723834]),
  GaussianCopula(par=[-0.08053081]),
  GaussianCopula(par=[-0.07758852]),
  GaussianCopula(par=[-0.35665741])],
 [GaussianCopula(par=[0.64614131]),
  GaussianCopula(par=[0.54497431]),
  GaussianCopula(par=[0.46927182]),
  GaussianCopula(par=[0.71024124])],
 [GaussianCopula(par=[-0.55794148]),
  GaussianCopula(par=[-0.38227897]),
  GaussianCopula(par=[-0.48864249])],
 [GaussianCopula(par=[0.64844621]), GaussianCopula(par=[0.41442168])],
 [GaussianCopula(par=[-0.99475601])]]

Generate a knockoff copy

In [20]: x_gau_ko = gau_ko.generate(x_test)

Pairwise scatter plots

In [21]: import pandas as pd

In [22]: import seaborn as sns

In [23]: import matplotlib.pyplot as plt

In [24]: sns.set(font_scale=1)

In [25]: sns.pairplot(pd.DataFrame(np.hstack((x_test, x_gau_ko))))
Out[25]: <seaborn.axisgrid.PairGrid at 0x7f1348f55cd0>
_images/gau_ko.png

Estimated correlations

In [26]: sns.set(font_scale=3)

In [27]: sns.heatmap(np.corrcoef(np.hstack((x_test, x_gau_ko)).T), annot=True);
_images/gau_ko_corr.png

Gaussian copula knockoffs#

Estimate a Gaussian copula knockoff model

In [28]: from vineknockoffs.vine_knockoffs import VineKnockoffs

In [29]: gau_cop_ko = VineKnockoffs()

In [30]: gau_cop_ko.fit_gaussian_copula_knockoffs(x_train=x_train)
Out[30]: <vineknockoffs.vine_knockoffs.VineKnockoffs at 0x7f1347498760>

In [31]: gau_cop_ko._dvine.copulas
Out[31]: 
[[GaussianCopula(par=[0.49889927]),
  GaussianCopula(par=[0.51485352]),
  GaussianCopula(par=[0.51563259]),
  GaussianCopula(par=[0.51825192]),
  GaussianCopula(par=[0.08920866]),
  GaussianCopula(par=[0.49889927]),
  GaussianCopula(par=[0.51485352]),
  GaussianCopula(par=[0.51563259]),
  GaussianCopula(par=[0.51825192])],
 [GaussianCopula(par=[-0.00416141]),
  GaussianCopula(par=[-0.01605925]),
  GaussianCopula(par=[0.00478767]),
  GaussianCopula(par=[0.15206381]),
  GaussianCopula(par=[0.09216385]),
  GaussianCopula(par=[-0.00416141]),
  GaussianCopula(par=[-0.01605925]),
  GaussianCopula(par=[0.00478767])],
 [GaussianCopula(par=[0.06845161]),
  GaussianCopula(par=[-0.01260832]),
  GaussianCopula(par=[0.19342715]),
  GaussianCopula(par=[0.17230366]),
  GaussianCopula(par=[0.24343657]),
  GaussianCopula(par=[0.06845161]),
  GaussianCopula(par=[-0.01260832])],
 [GaussianCopula(par=[0.00267909]),
  GaussianCopula(par=[0.44566922]),
  GaussianCopula(par=[0.43046756]),
  GaussianCopula(par=[0.4090201]),
  GaussianCopula(par=[0.45793907]),
  GaussianCopula(par=[0.00267909])],
 [GaussianCopula(par=[-0.33754818]),
  GaussianCopula(par=[-0.00999287]),
  GaussianCopula(par=[-0.11597554]),
  GaussianCopula(par=[-0.0109706]),
  GaussianCopula(par=[-0.36733475])],
 [GaussianCopula(par=[0.63963203]),
  GaussianCopula(par=[0.48851243]),
  GaussianCopula(par=[0.46462707]),
  GaussianCopula(par=[0.68064834])],
 [GaussianCopula(par=[-0.5505026]),
  GaussianCopula(par=[-0.33889836]),
  GaussianCopula(par=[-0.46158122])],
 [GaussianCopula(par=[0.63887789]), GaussianCopula(par=[0.37102548])],
 [GaussianCopula(par=[-0.99548401])]]

Generate a knockoff copy

In [32]: x_gau_cop_ko = gau_cop_ko.generate(x_test)

Pairwise scatter plots

In [33]: import pandas as pd

In [34]: import seaborn as sns

In [35]: import matplotlib.pyplot as plt

In [36]: sns.set(font_scale=1)

In [37]: sns.pairplot(pd.DataFrame(np.hstack((x_test, x_gau_cop_ko))))
Out[37]: <seaborn.axisgrid.PairGrid at 0x7f1347b128e0>
_images/gau_cop_ko.png

Estimated correlations

In [38]: sns.set(font_scale=3)

In [39]: sns.heatmap(np.corrcoef(np.hstack((x_test, x_gau_cop_ko)).T), annot=True);
_images/gau_cop_ko_corr.png

Vine copula knockoffs#

Estimate a Gaussian copula knockoff model

In [40]: from vineknockoffs.vine_knockoffs import VineKnockoffs

In [41]: vine_ko = VineKnockoffs()

In [42]: vine_ko.fit_vine_copula_knockoffs(x_train=x_train)
Out[42]: <vineknockoffs.vine_knockoffs.VineKnockoffs at 0x7f13447aa490>

In [43]: vine_ko._dvine.copulas
Out[43]: 
[[GaussianCopula(par=[0.48383016]),
  GaussianCopula(par=[0.51726826]),
  GaussianCopula(par=[0.27181872]),
  GaussianCopula(par=[0.51290159]),
  GaussianCopula(par=[0.16647727]),
  GaussianCopula(par=[0.48383016]),
  GaussianCopula(par=[0.51726826]),
  GaussianCopula(par=[0.27181872]),
  GaussianCopula(par=[0.51290159])],
 [IndepCopula(),
  IndepCopula(),
  GaussianCopula(par=[0.45121434]),
  IndepCopula(),
  GaussianCopula(par=[0.19896423]),
  IndepCopula(),
  IndepCopula(),
  GaussianCopula(par=[0.45121434])],
 [IndepCopula(),
  IndepCopula(),
  ClaytonCopula(par=[0.22514369], rotation=180),
  IndepCopula(),
  GaussianCopula(par=[0.46917166]),
  IndepCopula(),
  IndepCopula()],
 [IndepCopula(),
  GaussianCopula(par=[0.44616686]),
  GaussianCopula(par=[0.43643683]),
  IndepCopula(),
  GaussianCopula(par=[0.45584443]),
  IndepCopula()],
 [GaussianCopula(par=[-0.33754818]),
  GaussianCopula(par=[-0.00999287]),
  GaussianCopula(par=[-0.11597554]),
  GaussianCopula(par=[-0.36733475]),
  GaussianCopula(par=[-0.0109706])],
 [GaussianCopula(par=[0.63963203]),
  GaussianCopula(par=[0.48851243]),
  GaussianCopula(par=[0.01680553]),
  GaussianCopula(par=[0.68064834])],
 [GaussianCopula(par=[-0.5505026]),
  GaussianCopula(par=[-0.03065251]),
  GaussianCopula(par=[0.61868149])],
 [GaussianCopula(par=[0.0467642]), GaussianCopula(par=[-0.48578146])],
 [GaussianCopula(par=[0.99732389])]]

Generate a knockoff copy

In [44]: x_vine_ko = vine_ko.generate(x_test)

Pairwise scatter plots

In [45]: import pandas as pd

In [46]: import seaborn as sns

In [47]: import matplotlib.pyplot as plt

In [48]: sns.set(font_scale=1)

In [49]: sns.pairplot(pd.DataFrame(np.hstack((x_test, x_vine_ko))))
Out[49]: <seaborn.axisgrid.PairGrid at 0x7f1344bc5df0>
_images/vine_ko.png

Estimated correlations

In [50]: sns.set(font_scale=3)

In [51]: sns.heatmap(np.corrcoef(np.hstack((x_test, x_vine_ko)).T), annot=True);
_images/vine_ko_corr.png