正解データを読んでロジスティック回帰（LR）を学習
40
# Train a logistic-regression classifier from two labelled text files:
# every line of XXX.txt is a positive example (label 1) and every line of
# not_XXX.txt is a negative example (label 0).
# `preprocess` and `make_feature_vector` are defined elsewhere.
import codecs

from sklearn.linear_model import LogisticRegression

xs = []  # one feature vector per input line
ys = []  # matching labels, same order as xs

# `with` guarantees the file is closed even if preprocessing raises.
with codecs.open('XXX.txt', 'r', 'utf-8') as file_yes:
    for line in file_yes:
        line = preprocess(line)
        xs.append(make_feature_vector(line))
        ys.append(1)

with codecs.open('not_XXX.txt', 'r', 'utf-8') as file_no:
    for line in file_no:
        line = preprocess(line)
        xs.append(make_feature_vector(line))
        ys.append(0)

model = LogisticRegression()
model.fit(xs, ys)
41.
特徴ベクトルの作り方
41
from rules import rules
def make_feature_vector(s, rule_functions=None):
    """Turn a string into a binary feature vector, one entry per rule.

    Each rule is called with ``s``; a truthy result contributes 1 and a
    falsy result contributes 0, in rule order.

    Args:
        s: The text to featurize (passed unchanged to every rule).
        rule_functions: Optional iterable of callables to use as rules.
            When omitted, the module-level ``rules`` is used.

    Returns:
        A list of 0/1 ints with one element per rule.
    """
    if rule_functions is None:
        # Looked up at call time so the default always tracks the
        # module-level ``rules`` object.
        rule_functions = rules
    return [1 if f(s) else 0 for f in rule_functions]
対話的に学習データを追加
44
# Interactively grow the training data: score every line of SOMETHING.txt
# with the trained `model`, show the lines whose score is closest to 0.5
# first, and append each one to XXX.txt or not_XXX.txt according to the
# user's y/n answer. `preprocess`, `make_feature_vector` and `model` are
# defined elsewhere.
import codecs

try:
    read_answer = raw_input  # Python 2
except NameError:
    read_answer = input  # Python 3, where raw_input was renamed to input

buf = []
with codecs.open('SOMETHING.txt', 'r', 'utf-8') as unknown:
    for line in unknown:
        s = preprocess(line)
        v = make_feature_vector(s)
        # predict_proba expects a 2-D array of samples, so the single
        # vector is wrapped in a list. [0] selects that one sample and [1]
        # its second probability column (label 1 when the model was fit on
        # the labels 0 and 1).
        score = model.predict_proba([v])[0][1]
        buf.append((abs(0.5 - score), score, s, line))

# Tuples sort by their first element, |0.5 - score|, so the lines the
# model is least sure about are asked first.
buf.sort()

for _dum, score, s, line in buf:
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(u"{:.2f} {}".format(score, line.strip()))
    yn = read_answer('y/n?>')
    # The last line of the input may lack a newline; add one so the next
    # appended sample does not end up on the same line.
    record = line if line.endswith(u'\n') else line + u'\n'
    if yn == 'y':
        with codecs.open('XXX.txt', 'a', 'utf-8') as out:
            out.write(record)
    elif yn == 'n':
        with codecs.open('not_XXX.txt', 'a', 'utf-8') as out:
            out.write(record)
    else:
        print('passed')