BNLearner with latent variable
I don't know if this is supported or not, as BNLearner doesn't throw an error; maybe it's an issue, or something I still need to do.
- Using the default BNLearner to learn the parameters:
import matplotlib.pyplot as plt
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb
import pandas as pd
# reference BN: b -> f, b -> w, b -> h
bn = gum.BayesNet()
b, f, w, h = [bn.add(name, 2) for name in 'bfwh']
for link in [(b, f), (b, w), (b, h)]:
    bn.addArc(*link)
bn.cpt('b').fillWith([0.5, 0.5])
bn.cpt('w')[{'b': 0}] = [0.8, 0.2]
bn.cpt('w')[{'b': 1}] = [0.3, 0.7]
bn.cpt('f')[{'b': 0}] = [0.8, 0.2]
bn.cpt('f')[{'b': 1}] = [0.3, 0.7]
bn.cpt('h')[{'b': 0}] = [0.8, 0.2]
bn.cpt('h')[{'b': 1}] = [0.3, 0.7]
# sample
size = 1000
generator = gum.BNDatabaseGenerator(bn)
generator.setRandomVarOrder()
generator.drawSamples(size)
data = generator.to_pandas()
# latent variable: the entire 'b' column is missing
data_missing = data.copy()
data_missing['b'] = '?'
# learn with all data
learner = gum.BNLearner(data,bn,['?'])
print(f'missing values: {learner.hasMissingValues()}')
learner.useEM(1e-2)
learner.useSmoothingPrior()
learner.setVerbosity(True)
bn1 = learner.learnParameters(bn)
# learn with the missing (latent) 'b' column
learner = gum.BNLearner(data_missing,bn,['?'])
print(f'missing values: {learner.hasMissingValues()}')
print(learner.latentVariables())
learner.useMIIC()  # structure-learning choice; not used by learnParameters()
learner.useEM(1e-8)
learner.useSmoothingPrior()
learner.setVerbosity(True)
#print(f"# iterations : {learner.nbrIterations()}")
print(learner)
bn2 = learner.learnParameters(bn.dag())
print(f"# iterations : {learner.nbrIterations()}")
The code above shows that the CPTs learned from the "missing column" (latent) case are wrong (1st: original BN, 2nd: BN learned from all observations, 3rd: BN learned with latent 'b').
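For reference, a minimal sketch of how one CPT from the three networks can be compared side by side in a notebook (`bn1`/`bn2` are the two learned networks from the code above; the captions are mine):

# compare the same CPT across the original and the two learned networks
gnb.sideBySide(bn.cpt('w'), bn1.cpt('w'), bn2.cpt('w'),
               captions=['original', 'learned, complete data', "learned, latent 'b'"])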
- Code from the book Jensen and Nielsen, "Bayesian Networks and Decision Graphs", 2nd ed., p. 202, for parameter estimation with EM:
bn2 = gum.BayesNet()
b, f, w, h = [bn2.add(name, 2) for name in 'bfwh']
for link in [(b, f), (b, w), (b, h)]:
    bn2.addArc(*link)
bn2.cpt('b').fillWith([0.6, 0.4])
bn2.cpt('w')[{'b': 0}] = [0.6, 0.4]
bn2.cpt('w')[{'b': 1}] = [0.4, 0.6]
bn2.cpt('f')[{'b': 0}] = [0.6, 0.4]
bn2.cpt('f')[{'b': 1}] = [0.4, 0.6]
bn2.cpt('h')[{'b': 0}] = [0.6, 0.4]
bn2.cpt('h')[{'b': 1}] = [0.4, 0.6]
from collections import Counter

ie = gum.LazyPropagation(bn2)
iter_max = 10
kl = [0] * iter_max

def get_posterior(row_data, ie, bn, posterior):
    # use only the observed cells of the row as evidence
    evidence = {e: row_data[e] for e in bn.names() if row_data[e] != '?'}
    ie.setEvidence(evidence)
    ie.makeInference()
    # expected counts: joint posterior over the node and its parents (its family)
    return ie.jointPosterior(bn.family(posterior))
for k in range(iter_max):
    contador = Counter()
    # E-step: accumulate expected family counts over the incomplete database
    # (iterate over `data` instead for the fully observed run)
    for idx, row in data_missing.iterrows():
        for n in bn2.names():
            contador[n] += get_posterior(row, ie, bn2, n)
    # M-step: renormalize the expected counts into CPTs
    for n in 'bfwh':
        bn2.cpt(n)[:] = contador[n][:]
        bn2.cpt(n).normalizeAsCPT()
    # track the distance to the original BN
    kl[k] = gum.ExactBNdistance(bn, bn2).compute()['klPQ']
The code above could be optimized; I only used it for testing. Results after 10 iterations (got really close): (1st: BN learned from all observations, 2nd: BN learned with latent 'b').
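As a side note, a minimal sketch to plot the kl list computed above and visualize the EM convergence (plt comes from the matplotlib import at the top; the labels are mine):

# KL divergence between the original BN and the current estimate, per EM iteration
plt.plot(range(1, iter_max + 1), kl, marker='o')
plt.xlabel('EM iteration')
plt.ylabel('KL(original || learned)')
plt.show()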