1
1
"""Topic modeling with latent Dirichlet allocation via MALLET."""
2
2
import logging
3
3
import os
4
- import os .path as op
5
4
import shutil
6
- import subprocess
7
5
8
6
import numpy as np
9
7
import pandas as pd
12
10
from ..base import NiMAREBase
13
11
from ..due import due
14
12
from ..extract import download_mallet , utils
13
+ from ..utils import run_shell_command
15
14
16
15
LGR = logging .getLogger (__name__ )
17
16
@@ -73,12 +72,12 @@ def __init__(
73
72
self , text_df , text_column = "abstract" , n_topics = 50 , n_iters = 1000 , alpha = "auto" , beta = 0.001
74
73
):
75
74
mallet_dir = download_mallet ()
76
- mallet_bin = op .join (mallet_dir , "bin/mallet" )
75
+ mallet_bin = os . path .join (mallet_dir , "bin/mallet" )
77
76
78
77
model_dir = utils ._get_dataset_dir ("mallet_model" )
79
- text_dir = op .join (model_dir , "texts" )
78
+ text_dir = os . path .join (model_dir , "texts" )
80
79
81
- if not op .isdir (model_dir ):
80
+ if not os . path .isdir (model_dir ):
82
81
os .mkdir (model_dir )
83
82
84
83
if alpha == "auto" :
@@ -90,7 +89,7 @@ def __init__(
90
89
self .model_dir = model_dir
91
90
92
91
# Check for presence of text files and convert if necessary
93
- if not op .isdir (text_dir ):
92
+ if not os . path .isdir (text_dir ):
94
93
LGR .info ("Texts folder not found. Creating text files..." )
95
94
os .mkdir (text_dir )
96
95
@@ -104,11 +103,11 @@ def __init__(
104
103
105
104
for id_ in text_df ["id" ].values :
106
105
text = text_df .loc [text_df ["id" ] == id_ , text_column ].values [0 ]
107
- with open (op .join (text_dir , str (id_ ) + ".txt" ), "w" ) as fo :
106
+ with open (os . path .join (text_dir , str (id_ ) + ".txt" ), "w" ) as fo :
108
107
fo .write (text )
109
108
110
109
# Run MALLET topic modeling
111
- LGR .info ("Generating topics ..." )
110
+ LGR .info ("Compiling MALLET commands ..." )
112
111
import_str = (
113
112
f"{ mallet_bin } import-dir "
114
113
f"--input { text_dir } "
@@ -142,8 +141,9 @@ def fit(self):
142
141
p_word_g_topic_ : :obj:`numpy.ndarray`
143
142
Probability of each word given a topic
144
143
"""
145
- subprocess .call (self .commands_ [0 ], shell = True )
146
- subprocess .call (self .commands_ [1 ], shell = True )
144
+ LGR .info ("Generating topics..." )
145
+ run_shell_command (self .commands_ [0 ])
146
+ run_shell_command (self .commands_ [1 ])
147
147
148
148
# Read in and convert doc_topics and topic_keys.
149
149
topic_names = [f"topic_{ i :03d} " for i in range (self .params ["n_topics" ])]
@@ -158,7 +158,7 @@ def fit(self):
158
158
# on an individual id basis by the weights.
159
159
n_cols = (2 * self .params ["n_topics" ]) + 1
160
160
dt_df = pd .read_csv (
161
- op .join (self .model_dir , "doc_topics.txt" ),
161
+ os . path .join (self .model_dir , "doc_topics.txt" ),
162
162
delimiter = "\t " ,
163
163
skiprows = 1 ,
164
164
header = None ,
@@ -194,7 +194,7 @@ def fit(self):
194
194
195
195
# Topic word weights
196
196
p_word_g_topic_df = pd .read_csv (
197
- op .join (self .model_dir , "topic_word_weights.txt" ),
197
+ os . path .join (self .model_dir , "topic_word_weights.txt" ),
198
198
dtype = str ,
199
199
keep_default_na = False ,
200
200
na_values = [],
@@ -213,7 +213,7 @@ def fit(self):
213
213
shutil .rmtree (self .model_dir )
214
214
215
215
def _clean_str (self , string ):
216
- return op . basename (op .splitext (string )[0 ])
216
+ return os . path . basename (os . path .splitext (string )[0 ])
217
217
218
218
def _get_sort (self , lst ):
219
219
return [i [0 ] for i in sorted (enumerate (lst ), key = lambda x : x [1 ])]
0 commit comments