Source code for trisbm.sbmtm

"""
This module is cloned from https://github.com/martingerlach/hSBM_Topicmodel/commit/261d870cfc884c4f23ddaa213d07ccbddf348c78


Copyright(C) 2020 martingerlach

This program is free software: you can redistribute it and / or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from __future__ import print_function
import pandas as pd
import numpy as np
import os
import sys
import argparse
from collections import Counter, defaultdict
import pickle
import graph_tool.all as gt

import scipy
from matplotlib import pyplot as plt


class sbmtm():
    '''
    Class for topic-modeling with sbm's.
    '''

    def __init__(self):
        self.g = None  # network
        self.words = []  # list of word nodes
        self.documents = []  # list of document nodes
        self.state = None  # inference state from graphtool
        self.groups = {}  # results of group membership from inference
        self.mdl = np.nan  # minimum description length of inferred state
        self.L = np.nan  # number of levels in hierarchy
    def make_graph(self, list_texts, documents=None, counts=True, n_min=None):
        '''
        Load a corpus and generate the word-document network.

        optional arguments:

        :param documents: list of str, titles of documents
        :param counts: save edge-multiplicity as counts (default: True)
        :param n_min: int; filter all word-nodes with less than n_min counts (default: None)
        '''
        D = len(list_texts)

        # if there are no document titles, we assign integers 0,...,D-1
        # otherwise we use the supplied titles
        if documents is None:
            list_titles = [str(h) for h in range(D)]
        else:
            list_titles = documents

        # create a graph
        g = gt.Graph(directed=False)
        # define node properties
        # name: docs - title, words - 'word'
        # kind: docs - 0, words - 1
        name = g.vp["name"] = g.new_vp("string")
        kind = g.vp["kind"] = g.new_vp("int")
        if counts:
            ecount = g.ep["count"] = g.new_ep("int")

        docs_add = defaultdict(lambda: g.add_vertex())
        words_add = defaultdict(lambda: g.add_vertex())

        # add all documents first
        for i_d in range(D):
            title = list_titles[i_d]
            d = docs_add[title]

        # add all documents and words as nodes
        # add all tokens as links
        for i_d in range(D):
            title = list_titles[i_d]
            text = list_texts[i_d]

            d = docs_add[title]
            name[d] = title
            kind[d] = 0
            c = Counter(text)

            for word, count in c.items():
                w = words_add[word]
                name[w] = word
                kind[w] = 1
                if counts:
                    e = g.add_edge(d, w)
                    ecount[e] = count
                else:
                    for n in range(count):
                        g.add_edge(d, w)

        # filter word-types with less than n_min counts
        if n_min is not None:
            v_n = g.new_vertex_property("int")
            for v in g.vertices():
                v_n[v] = v.out_degree()

            v_filter = g.new_vertex_property("bool")
            for v in g.vertices():
                if v_n[v] < n_min and g.vp['kind'][v] == 1:
                    v_filter[v] = False
                else:
                    v_filter[v] = True
            g.set_vertex_filter(v_filter)
            g.purge_vertices()
            g.clear_filters()

        self.g = g
        self.words = [g.vp['name'][v] for v in g.vertices() if g.vp['kind'][v] == 1]
        self.documents = [g.vp['name'][v] for v in g.vertices() if g.vp['kind'][v] == 0]
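
    # Usage sketch (illustrative, not part of the original module): assuming a
    # tokenized corpus `texts` (a list of lists of str) with matching `titles`,
    # the word-document network could be built roughly like this:
    #
    #   model = sbmtm()
    #   model.make_graph(texts, documents=titles, counts=True, n_min=2)
    #   print(len(model.documents), len(model.words))  # D documents, V word types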
    def make_graph_from_BoW_df(self, df, counts=True, n_min=None):
        """
        Load a graph from a Bag-of-Words DataFrame.

        :param df: DataFrame whose index is a list of words and whose columns are a list of documents
        :param counts: save edge-multiplicity as counts (default: True)
        :param n_min: filter all word-nodes with less than n_min counts (default: None)
        """
        # create a graph
        g = gt.Graph(directed=False)
        # define node properties
        # name: docs - title, words - 'word'
        # kind: docs - 0, words - 1
        name = g.vp["name"] = g.new_vp("string")
        kind = g.vp["kind"] = g.new_vp("int")
        if counts:
            ecount = g.ep["count"] = g.new_ep("int")

        X = df.values
        X = scipy.sparse.coo_matrix(X)
        if not counts and X.dtype != int:
            X_int = X.astype(int)
            if not np.allclose(X.data, X_int.data):
                raise ValueError('Data must be integer if '
                                 'weighted_edges=False')
            X = X_int

        docs_add = defaultdict(lambda: g.add_vertex())
        words_add = defaultdict(lambda: g.add_vertex())

        D = len(df.columns)

        # add all documents first
        for i_d in range(D):
            title = df.columns[i_d]
            d = docs_add[title]
            name[d] = title
            kind[d] = 0

        # add all words
        for i_w in range(len(df.index)):
            word = df.index[i_w]
            w = words_add[word]
            name[w] = word
            kind[w] = 1

        # add all tokens as links
        for i_d in range(D):
            title = df.columns[i_d]
            text = df[title]

            for i_w, word, count in zip(range(len(df.index)), df.index, text):
                if count < 1:
                    continue
                if counts:
                    e = g.add_edge(i_d, D + i_w, add_missing=False)
                    ecount[e] = count
                else:
                    for n in range(count):
                        g.add_edge(i_d, D + i_w, add_missing=False)

        # filter word-types with less than n_min counts
        if n_min is not None:
            v_n = g.new_vertex_property("int")
            for v in g.vertices():
                v_n[v] = v.out_degree()

            v_filter = g.new_vertex_property("bool")
            for v in g.vertices():
                if v_n[v] < n_min and g.vp['kind'][v] == 1:
                    v_filter[v] = False
                else:
                    v_filter[v] = True
            g.set_vertex_filter(v_filter)
            g.purge_vertices()
            g.clear_filters()

        self.g = g
        self.words = [g.vp['name'][v] for v in g.vertices() if g.vp['kind'][v] == 1]
        self.documents = [g.vp['name'][v] for v in g.vertices() if g.vp['kind'][v] == 0]
        return self
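
    # Usage sketch (illustrative; `bow` is an assumed pandas DataFrame whose
    # index holds words and whose columns hold document names):
    #
    #   model = sbmtm()
    #   model.make_graph_from_BoW_df(bow, counts=True)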
    def save_graph(self, filename='graph.gt.gz'):
        '''
        Save the word-document network generated by make_graph() as filename.
        Allows for loading the graph without calling make_graph().
        '''
        self.g.save(filename)
    def load_graph(self, filename='graph.gt.gz'):
        '''
        Load a word-document network generated by make_graph() and saved with save_graph().
        '''
        self.g = gt.load_graph(filename)
        self.words = [self.g.vp['name'][v] for v in self.g.vertices() if self.g.vp['kind'][v] == 1]
        self.documents = [self.g.vp['name'][v] for v in self.g.vertices() if self.g.vp['kind'][v] == 0]
    def dump_model(self, filename="topsbm.pkl"):
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
    def load_model(self, filename="topsbm.pkl"):
        '''
        Load a model saved with dump_model().

        Note: rebinding ``self`` inside a method does not affect the caller's
        object, so the loaded attributes are copied onto this instance and the
        instance is also returned.
        '''
        if self.g is not None:
            del self.g
            del self.words
            del self.documents
        if self.state is not None:
            del self.state
            del self.groups
            del self.mdl
            del self.L
        with open(filename, 'rb') as f:
            loaded = pickle.load(f)
        # copy the loaded attributes onto this instance instead of rebinding
        # `self`, which would be a no-op for the caller
        self.__dict__.update(loaded.__dict__)
        return self
    def fit(self, overlap=False, hierarchical=True, B_min=2, B_max=None,
            n_init=1, parallel=False, verbose=False):
        '''
        Fit the sbm to the word-document network.

        :param overlap: bool (default: False). Overlapping or non-overlapping groups.
            Overlapping groups are handled in fit_overlap.
        :param hierarchical: bool (default: True). Hierarchical SBM or flat SBM.
            The flat SBM is not implemented yet.
        :param B_min: int (default: 2); passed to the graph-tool inference as the
            minimum number of blocks.
        :param B_max: int (default: None); maximum number of blocks. If None, the
            number of vertices is used.
        :param n_init: int (default: 1); number of different initial conditions to
            run in order to avoid local minima of the MDL.
        :param parallel: passed to mcmc_sweep. If parallel == False each vertex move
            attempt is made sequentially, where vertices are visited in random order.
            Otherwise the moves are attempted by sampling vertices randomly, so that
            the same vertex can be moved more than once before other vertices have
            had the chance to move.
        '''
        sequential = not parallel
        g = self.g
        if g is None:
            print('No data to fit the SBM. Load some data first (make_graph)')
        else:
            if overlap and "count" in g.ep:
                raise ValueError(
                    "When using overlapping SBMs, the graph must be constructed with 'counts=False'")
            clabel = g.vp['kind']

            state_args = {'clabel': clabel, 'pclabel': clabel}
            if "count" in g.ep:
                state_args["eweight"] = g.ep.count
            state_args["deg_corr"] = True
            state_args["overlap"] = overlap

            if B_max is None:
                B_max = self.g.num_vertices()

            # the inference
            mdl = np.inf
            for i_n_init in range(n_init):
                state_tmp = gt.minimize_nested_blockmodel_dl(
                    g,
                    state_args=state_args,
                    multilevel_mcmc_args={
                        "B_min": B_min,
                        "B_max": B_max,
                        "verbose": verbose},
                )
                mdl_tmp = state_tmp.entropy()
                if mdl_tmp < mdl:
                    mdl = 1.0 * mdl_tmp
                    state = state_tmp.copy()

            self.mdl = mdl
            self.state = state
            # minimum description length
            self.mdl = self.state.entropy()

            # collect group membership for each level in the hierarchy
            L = len(state.levels)
            dict_groups_L = {}

            # only trivial bipartite structure
            if L == 2:
                self.L = 1
                for l in range(L - 1):
                    dict_groups_l = self.get_groups(l=l)
                    dict_groups_L[l] = dict_groups_l
            # omit trivial levels: l=L-1 (single group), l=L-2 (bipartite)
            else:
                self.L = L - 2
                for l in range(L - 2):
                    dict_groups_l = self.get_groups(l=l)
                    dict_groups_L[l] = dict_groups_l
            self.groups = dict_groups_L
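
    # Usage sketch (illustrative, not part of the original module): after the
    # graph has been built, inference could look like this; `n_init` repeats the
    # minimization and keeps the state with the lowest description length:
    #
    #   model.fit(n_init=5, verbose=False)
    #   print(model.mdl)  # minimum description length of the selected state
    #   print(model.L)    # number of non-trivial levels in the hierarchy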
    def fit_overlap(self, n_init=1, hierarchical=True, B_min=20, B_max=160,
                    parallel=True, verbose=True):
        '''
        Fit the overlapping sbm to the word-document network.

        :param hierarchical: bool (default: True). Hierarchical SBM or flat SBM.
            The flat SBM is not implemented yet.
        :param B_min: int (default: 20); passed to the graph-tool inference as the
            minimum number of blocks.
        :param B_max: int (default: 160); maximum number of blocks.
        '''
        sequential = not parallel
        g = self.g
        clabel = g.vp['kind']
        state_args = {'clabel': clabel, 'pclabel': clabel}
        if "count" in g.ep:
            state_args["eweight"] = g.ep.count

        self.state = gt.minimize_nested_blockmodel_dl(
            g,
            B_min=B_min,
            B_max=B_max,
            overlap=True,
            mcmc_args={'sequential': sequential},
            mcmc_equilibrate_args={'mcmc_args': {'sequential': sequential}},
            mcmc_multilevel_args={
                'mcmc_equilibrate_args': {
                    'mcmc_args': {'sequential': sequential}
                },
                'anneal_args': {
                    'mcmc_equilibrate_args': {
                        'mcmc_args': {'sequential': sequential}
                    }
                }
            },
            verbose=verbose,
            nonoverlap_init=False,
            deg_corr=True)

        self.mdl = self.state.entropy()

        # collect group membership for each level in the hierarchy
        L = len(self.state.levels)
        dict_groups_L = {}

        # only trivial bipartite structure
        if L == 2:
            self.L = 1
            for l in range(L - 1):
                dict_groups_l = self.get_groups(l=l)
                dict_groups_L[l] = dict_groups_l
        # omit trivial levels: l=L-1 (single group), l=L-2 (bipartite)
        else:
            self.L = L - 2
            for l in range(L - 2):
                dict_groups_l = self.get_groups(l=l)
                dict_groups_L[l] = dict_groups_l
        self.groups = dict_groups_L
    def multiflip_mcmc_sweep(self, n_steps=1000, beta=np.inf, niter=10, verbose=True):
        '''
        Refine the fit of the sbm to the word-document network using
        multiflip_mcmc_sweep.

        :param n_steps: int (default: 1000); number of sweeps to perform.
        '''
        g = self.g
        if g is None:
            print('No data to fit the SBM. Load some data first (make_graph)')
        else:
            clabel = g.vp['kind']
            state_args = {'clabel': clabel, 'pclabel': clabel}
            if "count" in g.ep:
                state_args["eweight"] = g.ep.count

            state = self.state
            if state is not None:
                state = state.copy(bs=state.get_bs() + [np.zeros(1)] * 4,
                                   sampling=True)
            else:
                state = gt.NestedBlockState(g)

            for step in range(n_steps):  # n_steps should be sufficiently large
                if verbose:
                    print(f"step: {step}")
                state.multiflip_mcmc_sweep(beta=beta, niter=niter)

            self.state = state
            # minimum description length
            self.mdl = self.state.entropy()

            # collect group membership for each level in the hierarchy
            L = len(state.levels)
            dict_groups_L = {}

            # only trivial bipartite structure
            if L == 2:
                self.L = 1
                for l in range(L - 1):
                    dict_groups_l = self.get_groups(l=l)
                    dict_groups_L[l] = dict_groups_l
            # omit trivial levels: l=L-1 (single group), l=L-2 (bipartite)
            else:
                self.L = L - 2
                for l in range(L - 2):
                    dict_groups_l = self.get_groups(l=l)
                    dict_groups_L[l] = dict_groups_l
            self.groups = dict_groups_L
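
    # Usage sketch (illustrative): a fitted state can be refined further with
    # additional merge-split sweeps; the value of `n_steps` here is an arbitrary
    # example, not a recommendation:
    #
    #   model.fit(n_init=1)
    #   model.multiflip_mcmc_sweep(n_steps=100, beta=np.inf, niter=10)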
    def plot(self, filename=None, nedges=1000):
        '''
        Plot the graph and group structure.

        :param filename: str; where to save the plot. If None, the plot is not saved.
        :param nedges: int; number of edges to subsample for plotting (faster, less memory)
        '''
        self.state.draw(layout='bipartite', output=filename,
                        subsample_edges=nedges, hshortcuts=1, hide=0)
    def print_summary(self, tofile=True):
        '''
        Print the hierarchy summary (to summary.txt if tofile is True).
        '''
        if tofile:
            orig_stdout = sys.stdout
            f = open('summary.txt', 'w')
            sys.stdout = f
            self.state.print_summary()
            sys.stdout = orig_stdout
            f.close()
        else:
            self.state.print_summary()
    def topics(self, l=0, n=10):
        '''
        Get the n most common words for each word-group in level l.
        Returns tuples (word, P(w | tw)).
        '''
        dict_groups = self.get_groups(l)
        Bw = dict_groups['Bw']
        p_w_tw = dict_groups['p_w_tw']

        words = self.words

        # loop over all word-groups
        dict_group_words = {}
        for tw in range(Bw):
            p_w_ = p_w_tw[:, tw]
            ind_w_ = np.argsort(p_w_)[::-1]
            list_words_tw = []
            for i in ind_w_[:n]:
                if p_w_[i] > 0:
                    list_words_tw += [(words[i], p_w_[i])]
                else:
                    break
            dict_group_words[tw] = list_words_tw
        return dict_group_words
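
    # Usage sketch (illustrative): the most probable words of each topic at the
    # lowest level can be inspected like this:
    #
    #   for tw, topic_words in model.topics(l=0, n=5).items():
    #       print(tw, [w for w, p in topic_words])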
    def topicdist(self, doc_index, l=0):
        '''
        Get the topic distribution P(tw | d) of the document with index doc_index
        at level l, as a list of tuples (tw, P(tw | d)).
        '''
        dict_groups = self.get_groups(l)
        p_tw_d = dict_groups['p_tw_d']
        list_topics_tw = []
        for tw, p_tw in enumerate(p_tw_d[:, doc_index]):
            list_topics_tw += [(tw, p_tw)]
        return list_topics_tw
    def clusters(self, l=0, n=10):
        '''
        Get the n 'most common' documents from each document cluster.
        'Most common' refers to the largest contribution in the group-membership vector.
        For the non-overlapping case, each document belongs to one and only one group
        with probability 1.
        '''
        dict_groups = self.get_groups(l)
        Bd = dict_groups['Bd']
        p_td_d = dict_groups['p_td_d']

        docs = self.documents

        # loop over all doc-groups
        dict_group_docs = {}
        for td in range(Bd):
            p_d_ = p_td_d[td, :]
            ind_d_ = np.argsort(p_d_)[::-1]
            list_docs_td = []
            for i in ind_d_[:n]:
                if p_d_[i] > 0:
                    list_docs_td += [(docs[i], p_d_[i])]
                else:
                    break
            dict_group_docs[td] = list_docs_td
        return dict_group_docs
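
    # Usage sketch (illustrative): document clusters are read out analogously to
    # topics, e.g. the three highest-membership documents per cluster:
    #
    #   for td, cluster_docs in model.clusters(l=0, n=3).items():
    #       print(td, [title for title, p in cluster_docs])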
    def clusters_query(self, doc_index, l=0):
        '''
        Get all documents in the same group as the query document.
        Note: works only for the non-overlapping model.
        For the overlapping case, we need something else.
        '''
        dict_groups = self.get_groups(l)
        Bd = dict_groups['Bd']
        p_td_d = dict_groups['p_td_d']

        documents = self.documents

        # find the doc-group of the query document and all documents assigned to it
        td = np.argmax(p_td_d[:, doc_index])
        list_doc_index_sel = np.where(p_td_d[td, :] == 1)[0]

        list_doc_query = []
        for doc_index_sel in list_doc_index_sel:
            if doc_index != doc_index_sel:
                list_doc_query += [(doc_index_sel, documents[doc_index_sel])]

        return list_doc_query
    def group_membership(self, l=0):
        '''
        Return the group-membership vectors for
        - document-nodes: p_td_d, array with shape Bd x D
        - word-nodes: p_tw_w, array with shape Bw x V
        Each entry gives the probability that a node belongs to a given group.
        '''
        dict_groups = self.get_groups(l)
        p_tw_w = dict_groups['p_tw_w']
        p_td_d = dict_groups['p_td_d']
        return p_td_d, p_tw_w
    def print_topics(self, l=0, format='csv', path_save=''):
        '''
        Print topics, topic-distributions, and document clusters for a given level
        in the hierarchy.

        :param format: 'csv' (default), 'html', or 'tsv' (topics only)
        '''
        V = self.get_V()
        D = self.get_D()

        # topics
        dict_topics = self.topics(l=l, n=-1)

        list_topics = sorted(list(dict_topics.keys()))
        list_columns = ['Topic %s' % (t + 1) for t in list_topics]

        T = len(list_topics)
        df = pd.DataFrame(columns=list_columns, index=range(V))

        for t in list_topics:
            list_w = [h[0] for h in dict_topics[t]]
            V_t = len(list_w)
            df.iloc[:V_t, t] = list_w
        df = df.dropna(how='all', axis=0)

        if format == 'csv':
            fname_save = 'topsbm_level_%s_topics.csv' % (l)
            filename = os.path.join(path_save, fname_save)
            df.to_csv(filename, index=False, na_rep='')
        elif format == 'html':
            fname_save = 'topsbm_level_%s_topics.html' % (l)
            filename = os.path.join(path_save, fname_save)
            df.to_html(filename, index=False, na_rep='')
        elif format == 'tsv':
            fname_save = 'topsbm_level_%s_topics.tsv' % (l)
            filename = os.path.join(path_save, fname_save)
            df.to_csv(filename, index=False, na_rep='', sep='\t')
        else:
            pass

        # topic distributions
        list_columns = ['i_doc', 'doc'] + \
            ['Topic %s' % (t + 1) for t in list_topics]
        df = pd.DataFrame(columns=list_columns, index=range(D))
        for i_doc in range(D):
            list_topicdist = self.topicdist(i_doc, l=l)
            df.iloc[i_doc, 0] = i_doc
            df.iloc[i_doc, 1] = self.documents[i_doc]
            df.iloc[i_doc, 2:] = [h[1] for h in list_topicdist]
        df = df.dropna(how='all', axis=1)

        if format == 'csv':
            fname_save = 'topsbm_level_%s_topic-dist.csv' % (l)
            filename = os.path.join(path_save, fname_save)
            df.to_csv(filename, index=False, na_rep='')
        elif format == 'html':
            fname_save = 'topsbm_level_%s_topic-dist.html' % (l)
            filename = os.path.join(path_save, fname_save)
            df.to_html(filename, index=False, na_rep='')
        else:
            pass

        # doc-groups
        dict_clusters = self.clusters(l=l, n=-1)

        list_clusters = sorted(list(dict_clusters.keys()))
        list_columns = ['Cluster %s' % (t + 1) for t in list_clusters]

        T = len(list_clusters)
        df = pd.DataFrame(columns=list_columns, index=range(D))

        for t in list_clusters:
            list_d = [h[0] for h in dict_clusters[t]]
            D_t = len(list_d)
            df.iloc[:D_t, t] = list_d
        df = df.dropna(how='all', axis=0)

        if format == 'csv':
            fname_save = 'topsbm_level_%s_clusters.csv' % (l)
            filename = os.path.join(path_save, fname_save)
            df.to_csv(filename, index=False, na_rep='')
        elif format == 'html':
            fname_save = 'topsbm_level_%s_clusters.html' % (l)
            filename = os.path.join(path_save, fname_save)
            df.to_html(filename, index=False, na_rep='')
        else:
            pass

        # word-distributions
        list_topics = np.arange(len(self.get_groups(l)['p_w_tw'].T))
        list_columns = ["Topic %d" % (t + 1) for t in list_topics]

        pwtw_df = pd.DataFrame(data=self.get_groups(l)['p_w_tw'],
                               index=self.words, columns=list_columns)
        # drop words that appear in no topic (replace() returns a copy,
        # so the result has to be reassigned)
        pwtw_df = pwtw_df.replace(0, np.nan)
        pwtw_df = pwtw_df.dropna(how='all', axis=0)
        pwtw_df = pwtw_df.replace(np.nan, 0)

        if format == 'csv':
            fname_save = "topsbm_level_%d_word-dist.csv" % l
            filename = os.path.join(path_save, fname_save)
            pwtw_df.to_csv(filename, index=True, header=True, na_rep='')
        elif format == 'html':
            fname_save = "topsbm_level_%d_word-dist.html" % l
            filename = os.path.join(path_save, fname_save)
            pwtw_df.to_html(filename, index=True, na_rep='')
        else:
            pass
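
    # Usage sketch (illustrative): this writes files such as
    # 'topsbm_level_0_topics.csv' and 'topsbm_level_0_clusters.csv' into
    # `path_save` (here an assumed, already existing directory 'results'):
    #
    #   model.print_topics(l=0, format='csv', path_save='results')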
    ###########
    # HELPER FUNCTIONS
    ###########

    def get_mdl(self):
        return self.mdl

    # get group-topic statistics
    def get_groups(self, l=0):
        '''
        Extract statistics on group memberships of nodes from the inferred state.

        :return: dictionary with the following entries:

        - Bd: int, number of doc-groups
        - Bw: int, number of word-groups
        - p_tw_w: array (Bw x V); word-group membership:
          prob that word-node w belongs to word-group tw: P(tw | w)
        - p_td_d: array (Bd x D); doc-group membership:
          prob that doc-node d belongs to doc-group td: P(td | d)
        - p_w_tw: array (V x Bw); topic distribution:
          prob of word w given topic tw: P(w | tw)
        - p_tw_d: array (Bw x D); doc-topic mixtures:
          prob of word-group tw in doc d: P(tw | d)
        '''
        V = self.get_V()
        D = self.get_D()
        N = self.get_N()

        if l in self.groups.keys():
            return self.groups[l]

        g = self.g
        state = self.state
        state_l = state.project_level(l).copy(overlap=True)
        state_l_edges = state_l.get_edge_blocks()  # labeled half-edges

        counts = 'count' in self.g.ep.keys()

        # count labeled half-edges, group-memberships
        B = state_l.get_B()
        # number of half-edges incident on word-node w and labeled as word-group tw
        n_wb = np.zeros((V, B))
        # number of half-edges incident on document-node d and labeled as document-group td
        n_db = np.zeros((D, B))
        # number of half-edges incident on document-node d and labeled as word-group tw
        n_dbw = np.zeros((D, B))

        for e in g.edges():
            z1, z2 = state_l_edges[e]
            v1 = e.source()
            v2 = e.target()
            if counts:
                weight = g.ep["count"][e]
            else:
                weight = 1
            n_db[int(v1), z1] += weight
            n_dbw[int(v1), z2] += weight
            n_wb[int(v2) - D, z2] += weight

        p_w = np.sum(n_wb, axis=1) / float(np.sum(n_wb))

        ind_d = np.where(np.sum(n_db, axis=0) > 0)[0]
        Bd = len(ind_d)
        n_db = n_db[:, ind_d]

        ind_w = np.where(np.sum(n_wb, axis=0) > 0)[0]
        Bw = len(ind_w)
        n_wb = n_wb[:, ind_w]

        ind_w2 = np.where(np.sum(n_dbw, axis=0) > 0)[0]
        n_dbw = n_dbw[:, ind_w2]

        # group-membership distributions
        # group membership of each word-node P(tw | w)
        p_tw_w = (n_wb / np.sum(n_wb, axis=1)[:, np.newaxis]).T
        # group membership of each doc-node P(td | d)
        p_td_d = (n_db / np.sum(n_db, axis=1)[:, np.newaxis]).T
        # topic distribution for words P(w | tw)
        p_w_tw = n_wb / np.sum(n_wb, axis=0)[np.newaxis, :]
        # mixture of word-groups into documents P(tw | d)
        p_tw_d = (n_dbw / np.sum(n_dbw, axis=1)[:, np.newaxis]).T

        result = {}
        result['Bd'] = Bd
        result['Bw'] = Bw
        result['p_tw_w'] = p_tw_w
        result['p_td_d'] = p_td_d
        result['p_w_tw'] = p_w_tw
        result['p_tw_d'] = p_tw_d

        self.groups[l] = result
        return result
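
    # Usage sketch (illustrative): the returned arrays can be used directly,
    # e.g. the topic mixture P(tw | d) of the first document at level 0:
    #
    #   groups = model.get_groups(l=0)
    #   print(groups['Bd'], groups['Bw'])  # number of doc- and word-groups
    #   print(groups['p_tw_d'][:, 0])      # P(tw | d) for document index 0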
    def search_consensus(self, force_niter=100000, niter=100):
        '''
        Obtain a consensus estimate of the nested partition by sampling partitions
        with mcmc_equilibrate and computing the partition mode.
        Replaces self.state with the consensus estimate and returns the marginal
        group membership of each vertex.
        '''
        # collect nested partitions
        bs = []

        def collect_partitions(s):
            bs.append(s.get_bs())

        # now we collect the partitions for exactly force_niter sweeps
        gt.mcmc_equilibrate(
            self.state,
            force_niter=force_niter,
            mcmc_args=dict(niter=niter),
            callback=collect_partitions)

        # disambiguate partitions and obtain marginals
        pmode = gt.PartitionModeState(bs, nested=True, converge=True)
        pv = pmode.get_marginal(self.g)

        # get consensus estimate
        bs = pmode.get_max_nested()
        self.state = self.state.copy(bs=bs)

        return pv
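
    # Usage sketch (illustrative; the values for force_niter and niter are
    # arbitrary examples, much smaller than the defaults):
    #
    #   pv = model.search_consensus(force_niter=1000, niter=10)
    #   # pv holds the marginal group membership of each vertex;
    #   # model.state now holds the consensus estimate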
    # helper functions

    def get_V(self):
        '''
        :return: number of word-nodes == types
        '''
        return int(np.sum(self.g.vp['kind'].a == 1))  # no. of types

    def get_D(self):
        '''
        :return: number of doc-nodes == number of documents
        '''
        return int(np.sum(self.g.vp['kind'].a == 0))  # no. of documents

    def get_N(self):
        '''
        :return: number of edges == tokens
        '''
        return int(self.g.num_edges())  # no. of tokens
    def group_to_group_mixture(self, l=0, norm=True):
        '''
        Return the group-to-group edge-count matrix between doc-groups and
        word-groups at level l, normalized to a joint probability if norm is True.
        '''
        V = self.get_V()
        D = self.get_D()
        N = self.get_N()

        g = self.g
        state = self.state
        state_l = state.project_level(l).copy(overlap=True)
        state_l_edges = state_l.get_edge_blocks()  # labeled half-edges

        # count labeled half-edges, group-memberships
        B = state_l.get_B()
        n_td_tw = np.zeros((B, B))

        counts = 'count' in self.g.ep.keys()

        for e in g.edges():
            z1, z2 = state_l_edges[e]
            if counts:
                n_td_tw[z1, z2] += g.ep["count"][e]
            else:
                n_td_tw[z1, z2] += 1

        ind_d = np.where(np.sum(n_td_tw, axis=1) > 0)[0]
        Bd = len(ind_d)
        ind_w = np.where(np.sum(n_td_tw, axis=0) > 0)[0]
        Bw = len(ind_w)

        n_td_tw = n_td_tw[:Bd, Bd:]

        if norm:
            return n_td_tw / np.sum(n_td_tw)
        else:
            return n_td_tw
    def plot_topic_dist(self, l):
        groups = self.get_groups(l)

        p_w_tw = groups['p_w_tw']
        fig = plt.figure(figsize=(12, 10))
        plt.imshow(p_w_tw, origin='lower', aspect='auto', interpolation='none')
        plt.title(r'Word group membership $P(w | tw)$')
        plt.xlabel('Topic, tw')
        plt.ylabel('Word w (index)')
        plt.colorbar()
        fig.savefig("p_w_tw_%d.png" % l)

        p_tw_d = groups['p_tw_d']
        fig = plt.figure(figsize=(12, 10))
        plt.imshow(p_tw_d, origin='lower', aspect='auto', interpolation='none')
        plt.title(r'Word group membership $P(tw | d)$')
        plt.xlabel('Document (index)')
        plt.ylabel('Topic, tw')
        plt.colorbar()
        fig.savefig("p_tw_d_%d.png" % l)
    def save_data(self):
        for i in range(len(self.state.get_levels()) - 2)[::-1]:
            print("Saving level %d" % i)
            self.print_topics(l=i)
            self.print_topics(l=i, format='tsv')
            self.plot_topic_dist(i)
            e = self.state.get_levels()[i].get_matrix()
            plt.matshow(e.todense())
            plt.savefig("mat_%d.png" % i)
        self.print_summary()