BNLearner with latent variable
I don't know if this is supported or not, as BNLearner doesn't throw an error; maybe it's an issue, or something I still need to do.
- Using the default BNLearner to learn the parameters:
import matplotlib.pyplot as plt
import pyAgrum as gum
import pyAgrum.lib.notebook as gnb
import pandas as pd
# reference BN: b -> f, b -> w, b -> h
bn = gum.BayesNet()
b, f, w, h = [bn.add(name, 2) for name in 'bfwh']
for link in [(b, f), (b, w), (b, h)]:
    bn.addArc(*link)
bn.cpt('b').fillWith([0.5, 0.5])
bn.cpt('w')[{'b': 0}] = [0.8, 0.2]
bn.cpt('w')[{'b': 1}] = [0.3, 0.7]
bn.cpt('f')[{'b': 0}] = [0.8, 0.2]
bn.cpt('f')[{'b': 1}] = [0.3, 0.7]
bn.cpt('h')[{'b': 0}] = [0.8, 0.2]
bn.cpt('h')[{'b': 1}] = [0.3, 0.7]
# sample
size = 1000
generator = gum.BNDatabaseGenerator(bn)
generator.setRandomVarOrder()
generator.drawSamples(size)
data = generator.to_pandas()
# latent variable: the entire 'b' column is missing
data_missing = data.copy()
data_missing['b'] = '?'
# learn with all data
learner = gum.BNLearner(data,bn,['?'])
print(f'missing values: {learner.hasMissingValues()}')
learner.useEM(1e-2)
learner.useSmoothingPrior()
learner.setVerbosity(True)
bn1 = learner.learnParameters(bn)
# learn with the missing (latent) 'b' column
learner = gum.BNLearner(data_missing,bn,['?'])
print(f'missing values: {learner.hasMissingValues()}')
print(learner.latentVariables())
learner.useMIIC()  # structure-learning choice; not used by learnParameters()
learner.useEM(1e-8)
learner.useSmoothingPrior()
learner.setVerbosity(True)
#print(f"# iterations : {learner.nbrIterations()}")
print(learner)
bn2 = learner.learnParameters(bn.dag())
print(f"# iterations : {learner.nbrIterations()}")
The code above shows that the CPTs learned from the "missing column" (latent) case are wrong (1st: original BN, 2nd: BN learned from all observations, 3rd: BN learned with latent 'b').
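For reference, a minimal sketch of how one CPT from the three networks can be compared side by side in a notebook (`bn1`/`bn2` are the two learned networks from the code above; the captions are mine):

# compare the same CPT across the original and the two learned networks
gnb.sideBySide(bn.cpt('w'), bn1.cpt('w'), bn2.cpt('w'),
               captions=['original', 'learned, complete data', "learned, latent 'b'"])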
- Code from the book Jensen and Nielsen, "Bayesian Networks and Decision Graphs", 2nd ed., p. 202, for parameter estimation with EM:
bn2 = gum.BayesNet()
b, f, w, h = [bn2.add(name, 2) for name in 'bfwh']
for link in [(b, f), (b, w), (b, h)]:
    bn2.addArc(*link)
bn2.cpt('b').fillWith([0.6, 0.4])
bn2.cpt('w')[{'b': 0}] = [0.6, 0.4]
bn2.cpt('w')[{'b': 1}] = [0.4, 0.6]
bn2.cpt('f')[{'b': 0}] = [0.6, 0.4]
bn2.cpt('f')[{'b': 1}] = [0.4, 0.6]
bn2.cpt('h')[{'b': 0}] = [0.6, 0.4]
bn2.cpt('h')[{'b': 1}] = [0.4, 0.6]
from collections import Counter

ie = gum.LazyPropagation(bn2)
iter_max = 10
kl = [0] * iter_max

def get_posterior(row_data, ie, bn, posterior):
    # use only the observed cells of the row as evidence
    evidence = {e: row_data[e] for e in bn.names() if row_data[e] != '?'}
    ie.setEvidence(evidence)
    ie.makeInference()
    # expected counts: joint posterior over the node and its parents (its family)
    return ie.jointPosterior(bn.family(posterior))
for k in range(iter_max):
    contador = Counter()
    # E-step: accumulate expected family counts over the incomplete database
    # (iterate over `data` instead for the fully observed run)
    for idx, row in data_missing.iterrows():
        for n in bn2.names():
            contador[n] += get_posterior(row, ie, bn2, n)
    # M-step: renormalize the expected counts into CPTs
    for n in 'bfwh':
        bn2.cpt(n)[:] = contador[n][:]
        bn2.cpt(n).normalizeAsCPT()
    # track the distance to the original BN
    kl[k] = gum.ExactBNdistance(bn, bn2).compute()['klPQ']
The code above could be optimized; I only used it for testing. Results after 10 iterations (got really close): (1st: BN learned from all observations, 2nd: BN learned with latent 'b').
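As a side note, a minimal sketch to plot the kl list computed above and visualize the EM convergence (plt comes from the matplotlib import at the top; the labels are mine):

# KL divergence between the original BN and the current estimate, per EM iteration
plt.plot(range(1, iter_max + 1), kl, marker='o')
plt.xlabel('EM iteration')
plt.ylabel('KL(original || learned)')
plt.show()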