# Published Date: 2019-05-28 07:41
"""Tokenize comedy-script ("daihon") lines with SudachiPy and save the result.

Reads ../../daihon.json, wakati-splits every line of every script with the
Sudachi tokenizer (split mode C = longest units), pickles the nested token
lists with dill, and writes one space-joined line per row to ../../lines.txt
so gensim can consume it later.
"""
# Needed to read the JSON data files.
import json
# SudachiPy (old settings-file API: config.SETTINGFILE + Dictionary(settings)).
from sudachipy import config
from sudachipy import dictionary
from sudachipy import tokenizer

# Read the Sudachi settings file and build the tokenizer object.
with open(config.SETTINGFILE, 'r', encoding='utf-8') as f:
    settings = json.load(f)
tokenizer_obj = dictionary.Dictionary(settings).create()

# Load the scripts. The working directory is presumably ./src/sudachipy,
# so the data lives two directories up (../../) — TODO confirm.
with open('../../daihon.json', 'r', encoding='utf-8') as f:
    daihon = json.load(f)

# Title of every script.
titles = [d['title'] for d in daihon]

# Split mode C joins nouns into the longest possible units.
mode = tokenizer.Tokenizer.SplitMode.C


def parse_text_by_line(text, mode):
    """Wakati-split each line of *text* (an iterable of strings) with Sudachi.

    Speaker prefixes ('Boke: ', 'mannaka: ', 'Tukkomi: ') are stripped
    before tokenizing.  Returns one list of surface forms per input line.
    """
    tokens_list = []
    for line in text:
        line = line.replace('Boke: ', '').replace('mannaka: ', '').replace('Tukkomi: ', '')
        tokens = tokenizer_obj.tokenize(mode, line)
        tokens_list.append([token.surface() for token in tokens])
    return tokens_list


# Tokenize every line of every script. This takes a long time.
sudachi_daihon = [parse_text_by_line(d['article'], mode) for d in daihon]

# Pickle the result so the slow step never has to run twice.
import dill
with open('../../sudachi_daihon.pkl', 'wb') as f:
    dill.dump(sudachi_daihon, f)

# Flatten to one space-joined string per original line.
lines = [' '.join(s) for sd in sudachi_daihon for s in sd]

# Write a text file for gensim.
# BUG FIX: the original reopened the file in append mode once per line,
# which is slow and duplicates output on every rerun; open once in 'w'
# with an explicit encoding instead.
with open('../../lines.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in lines)
"""Train a Doc2Vec model over the tokenized lines and sanity-check it."""
# Library imports.
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from gensim.models.deprecated.doc2vec import TaggedLineDocument

# Stream of (words, tag) documents, one per line of lines.txt.
sentence_model = TaggedLineDocument('../../lines.txt')
# Long form for reference:
# model = Doc2Vec(alpha=0.025, min_alpha=0.025, dm=1, min_count=10)
# model.build_vocab(sentence_model)

# Shortcut: passing corpus_file builds the vocab and trains in one go.
model = Doc2Vec(corpus_file='../../lines.txt', dm=1, vector_size=300,
                window=20, min_count=10, workers=6)

# Extra training passes over the tagged stream.
model.train(sentence_model, epochs=20, total_examples=len(lines))

# Save the trained model.
model.save('sentence_model.model')

# Spot-check: pick a random line and show its nearest neighbours.
radom_choice = np.random.choice(len(lines))
for i, m in enumerate(model.docvecs.most_similar(radom_choice)):
    print('*********************************************************************************')
    print(f'original line: {lines[radom_choice]} and indices is : {radom_choice}')
    print(f'similar top {i+1} is : {lines[m[0]]} and indices is : {m[0]} and similar is : {m[1]}')
    print('*********************************************************************************')
    print('*********************************************************************************')
    # BUG FIX: the original tested the undefined name `random_choice`
    # (NameError) and used `or` / `!= len(lines)`, which still allowed an
    # IndexError on `radom_choice + 1` at the last index.  Guard each
    # neighbour lookup separately instead.
    if radom_choice != 0:
        print(f'original line before: {lines[radom_choice-1]} and indices is : {radom_choice-1}')
    print()
    if radom_choice + 1 < len(lines):
        print(f'original line after: {lines[radom_choice+1]} and indices is : {radom_choice+1}')
    print('\nNext\n')
# Separate the lines of the Boke, the Tukkomi, and (for trios) the mannaka.
import re


def _extract_speaker_lines(scripts, speaker):
    """Collect every line spoken by *speaker* ('Boke'/'mannaka'/'Tukkomi').

    The 'speaker: ' prefix is stripped; lines with no match are skipped.
    (Replaces three copy-pasted loops whose bare `except: pass` swallowed
    every exception — the only expected failure is `search` returning None.)
    """
    pattern = re.compile(speaker + r':[^\n]+')
    result = []
    for d in scripts:
        for line in d['article']:
            m = pattern.search(line)
            if m:
                result.append(m.group().replace(speaker + ': ', ''))
    return result


boke_lines = _extract_speaker_lines(daihon, 'Boke')
mannaka_lines = _extract_speaker_lines(daihon, 'mannaka')
tukkomi_lines = _extract_speaker_lines(daihon, 'Tukkomi')


def _write_lines(path, rows):
    """Write one row per line. BUG FIX: the original reopened the file in
    append mode once per row, duplicating output on every rerun."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(row + '\n' for row in rows)


_write_lines('../../boke_lines.txt', boke_lines)
_write_lines('../../mannaka_lines.txt', mannaka_lines)
_write_lines('../../tukkomi_lines.txt', tukkomi_lines)

# Train one Doc2Vec model per speaker.
boke_sentence = TaggedLineDocument('../../boke_lines.txt')
mannaka_sentence = TaggedLineDocument('../../mannaka_lines.txt')
tukkomi_sentence = TaggedLineDocument('../../tukkomi_lines.txt')
boke_model = Doc2Vec(corpus_file='../../boke_lines.txt', dm=1, vector_size=300,
                     window=20, min_count=10, workers=6)
mannaka_model = Doc2Vec(corpus_file='../../mannaka_lines.txt', dm=1, vector_size=300,
                        window=20, min_count=10, workers=6)
tukkomi_model = Doc2Vec(corpus_file='../../tukkomi_lines.txt', dm=1, vector_size=300,
                        window=20, min_count=10, workers=6)
boke_model.save('../../boke_model.model')
mannaka_model.save('../../mannaka_model.model')
tukkomi_model.save('../../tukkomi_model.model')

# Spot-check: pick a random tukkomi line and show its nearest neighbours.
radom_choice = np.random.choice(len(tukkomi_lines))
for i, m in enumerate(tukkomi_model.docvecs.most_similar(radom_choice)):
    print('*********************************************************************************')
    print(f'original line: {tukkomi_lines[radom_choice]} and indices is : {radom_choice}')
    print(f'similar top {i+1} is : {tukkomi_lines[m[0]]} and indices is : {m[0]} and similar is : {m[1]}')
    print('*********************************************************************************')
    print('*********************************************************************************')
    # BUG FIX: the original tested the undefined name `random_choice`
    # (NameError) and could raise IndexError on `radom_choice + 1` at the
    # last index; guard each neighbour lookup separately.
    if radom_choice != 0:
        print(f'original line before: {tukkomi_lines[radom_choice-1]} and indices is : {radom_choice-1}')
    print()
    if radom_choice + 1 < len(tukkomi_lines):
        print(f'original line after: {tukkomi_lines[radom_choice+1]} and indices is : {radom_choice+1}')
    print('\nNext\n')
# Play further: pair a random tukkomi line with a random boke line and
# show the nearest neighbour of each.
random_choice_tukkomi = np.random.choice(len(tukkomi_lines))
random_choice_boke = np.random.choice(len(boke_lines))
tukkomi_line = tukkomi_lines[random_choice_tukkomi]
boke_line = boke_lines[random_choice_boke]

# topn=1 replaces the original enumerate/`if i == 0` loops, which only
# ever used the first result.
nearest_tukkomi = tukkomi_model.docvecs.most_similar(random_choice_tukkomi, topn=1)[0]
print(tukkomi_lines[nearest_tukkomi[0]])
nearest_boke = boke_model.docvecs.most_similar(random_choice_boke, topn=1)[0]
print(boke_lines[nearest_boke[0]])

print()
print(tukkomi_line)
print(boke_line)
# BUG FIX: guard the +1 lookup — the original raised IndexError whenever
# the last index was drawn.
if random_choice_tukkomi + 1 < len(tukkomi_lines):
    print(tukkomi_lines[random_choice_tukkomi + 1])