针对结巴分词Memory Error的两种解决方式

一、背景

​ 最近,在使用Gensim Word2vec根据特定语料训练近义词模型,模型训练输入语料要求是分词之后的文件。使用结巴jieba对原始语料文件进行分词,在分词过程中,由于语料文件太大,将近五千万的数据量,出现了Memory Error问题。针对此问题,提供以下两种解决方式。同时,代码中展示了分词时对词语词性的筛选,停用词及标点符号的过滤。最后,附上根据分词文件进行模型训练代码。

二、解决方式

解决思路:一是在读取文件数据时避免一次性全部加载数据,单线程按行加载处理数据;二是将存储有大数据量的一个文件拆分为多个,借助jieba的并行模式(基于multiprocessing的多进程)并行分词。

2.1 第一种按行加载处理数据的解决方案代码

# -*- coding: utf-8 -*-
"""
由原始文本进行分词后保存到新的文件
"""
import jieba
import numpy as np
import jieba.posseg as pseg
import re

# Raw corpus to segment; it is read line by line further down.
filePath='/data/work/keyword/work_data/work_title_description.csv'
# Destination file for the segmented text.
fileSegWordDonePath ='/data/work/keyword/work_cutdata/corpus_line.txt'

# Stop-word loading: path of the stop-word list handed to stopwordslist().
stop_word_path = '/data/work/keyword/keyword_extraction-master/data/stopWord.txt'
def stopwordslist(filepath):
    """Load stop words from a text file, one word per line.

    The file is decoded as UTF-8 (a leading BOM is tolerated) so the returned
    entries are ``str`` and can be compared with the ``str`` tokens produced
    by jieba. Reading the file in binary mode would yield ``bytes`` that never
    compare equal to a token, which silently disables stop-word filtering.

    Args:
        filepath: Path of the stop-word file.

    Returns:
        A list with one whitespace-stripped ``str`` per line of the file.
    """
    # The context manager closes the handle even if decoding fails.
    with open(filepath, 'r', encoding='utf-8-sig') as f:
        return [line.strip() for line in f]
    
# Print a list of (Chinese) strings, one element per line
def PrintListChinese(list):
    """Print every element of ``list`` on its own line, in order."""
    for element in list:
        print(element)
        
# Segment the corpus one input line at a time and stream the kept words
# straight into the output file. Neither the raw input nor the segmented text
# is ever held in memory as a whole, which is what triggered the MemoryError.
jieba.enable_paddle()

# Stop words go into a set for O(1) membership tests. Entries are normalised to
# str so the comparison with jieba's str tokens works even when the loader
# hands back bytes.
stopwords = {
    w.decode('utf-8') if isinstance(w, bytes) else w
    for w in stopwordslist(stop_word_path)  # path of the stop-word list
}

# Digits, whitespace and punctuation are replaced by a single space.
# Compiled once instead of once per token.
# NOTE(review): inside the first character class ":-【" is parsed as a range
# from ':' to '【', so that class also matches ASCII letters -- confirm that
# dropping Latin text is intended. The pattern itself is left unchanged.
punctuation = re.compile(r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+")

with open(filePath, 'r', encoding='utf-8') as fileTrainRaw, \
        open(fileSegWordDonePath, 'w', encoding='utf-8') as fW:
    for i, line in enumerate(fileTrainRaw):  # read the file line by line
        for x in pseg.cut(line, use_paddle=True):
            # Keep only the selected parts of speech.
            if not (x.flag in ('n', 'nw', 'nz') or x.flag.startswith(('TIME', 't'))):
                continue
            if x.word in stopwords:
                continue
            # Strip punctuation, then emit the word followed by a separator.
            fW.write(punctuation.sub(" ", x.word))
            fW.write(" ")
        if i % 100 == 0:
            print(i)  # progress indicator
    # All words are written to one output line, terminated here; this keeps
    # the layout of the output file unchanged.
    fW.write('\n')
      
## 2.2 第二种将存储有大数据量的一个文件拆分为多个的解决方案代码
    # -*-coding:utf-8 -*-
    import jieba.analyse
    import jieba
    import os
    import jieba.posseg as pseg
    
    # Turn on jieba's parallel segmentation mode; per the jieba documentation
    # the argument is the number of worker processes.
    jieba.enable_parallel(4)
    # Directory whose files are listed and segmented by cut_word().
    raw_data_path = '/data/work/keyword/work_data/'
    # Directory that receives the output files written by cut_word().
    cut_data_path = '/data/work/keyword/work_cutdata/'
    # Path of the stop-word list handed to stopwordslist().
    stop_word_path = '/data/work/keyword/keyword_extraction-master/data/stopWord.txt'
    def stopwordslist(filepath):
        """Load stop words from a text file, one word per line.

        The file is decoded as UTF-8 (a leading BOM is tolerated) so the
        returned entries are ``str`` and can be compared with segmented words.
        Reading the file in binary mode would yield ``bytes`` that never
        compare equal to a ``str`` word, silently disabling the filter.

        Args:
            filepath: Path of the stop-word file.

        Returns:
            A list with one whitespace-stripped ``str`` per line of the file.
        """
        # The context manager closes the handle even if decoding fails.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            return [line.strip() for line in f]
    def cut_word(raw_data_path, cut_data_path):
        """Segment every file in a directory and write a stop-word-free corpus.

        Two files are produced inside ``cut_data_path``:

        * ``corpus.txt``  -- the space-separated jieba segmentation of all
          input files, written file by file.
        * ``corpus1.txt`` -- the same text with stop words removed, filtered
          line by line.

        Data is streamed through the files instead of being concatenated into
        one in-memory string, so peak memory is bounded by the largest single
        input file rather than by the whole corpus.

        Args:
            raw_data_path: Directory containing the raw (pre-split) data files.
            cut_data_path: Directory that receives the output files.
        """
        # Only regular files are segmented; sorting makes the order of the
        # corpus reproducible (os.listdir returns entries in arbitrary order).
        data_file_list = sorted(
            name for name in os.listdir(raw_data_path)
            if os.path.isfile(os.path.join(raw_data_path, name))
        )
        corpus_path = os.path.join(cut_data_path, 'corpus.txt')
        filtered_path = os.path.join(cut_data_path, 'corpus1.txt')

        with open(corpus_path, 'w', encoding='utf-8') as corpus_file:
            for index, name in enumerate(data_file_list, 1):
                print(index)  # progress: number of the file being processed
                with open(os.path.join(raw_data_path, name), 'rb') as f:
                    document = f.read()
                result = ' '.join(jieba.cut(document, cut_all=False))
                corpus_file.write(result)
                # Separate consecutive files so the last word of one file is
                # not glued to the first word of the next.
                if not result.endswith('\n'):
                    corpus_file.write('\n')

        # Normalise stop words to a set of str: O(1) lookups, and the
        # comparison with str words works even if the loader returns bytes.
        stopwords = {
            w.decode('utf-8') if isinstance(w, bytes) else w
            for w in stopwordslist(stop_word_path)  # path of the stop-word list
        }

        # Reading and writing must use the same encoding.
        with open(corpus_path, 'r', encoding='utf-8') as src, \
                open(filtered_path, 'w', encoding='utf-8') as dst:
            for line in src:
                # Split on whitespace to get whole words; iterating the string
                # directly would walk over single characters instead.
                kept = [word for word in line.split() if word not in stopwords]
                if kept:
                    dst.write(' '.join(kept))
                    dst.write('\n')
    # Run the segmentation only when this file is executed as a script.
    if __name__ == "__main__":
        cut_word(raw_data_path, cut_data_path )

三、使用Gensim Word2vec训练模型

"""
gensim word2vec获取词向量
"""
import warnings
import logging
import os.path
import sys
import multiprocessing
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

# Suppress UserWarning messages raised from the gensim module
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
if __name__ == '__main__':
    # The logger is named after the running script file.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s',level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # inp: segmented input corpus; out_model: saved model;
    # out_vector: the word vectors in plain-text word2vec format.
    inp = '/data/work/keyword/work_cutdata/corpus_line.txt'
    out_model = '/data/work/keyword/word2vec_model/work_title_description.model'
    out_vector = '/data/work/keyword/word2vec_model/work_title_description.vector'

    # Train the model on the segmented corpus, one worker per CPU core.
    # NOTE(review): the original comment called this a skip-gram model, but
    # `sg` is not passed and gensim documents sg=0 (CBOW) as the default --
    # confirm which algorithm is intended.
    # NOTE(review): `size` is the pre-4.0 gensim keyword; gensim 4 renamed it
    # to `vector_size`.
    sentences = LineSentence(inp)
    worker_count = multiprocessing.cpu_count()
    model = Word2Vec(sentences, size=50, window=5, min_count=5,
                     workers=worker_count)

    # Persist the full model, then export the word vectors as text.
    model.save(out_model)
    model.wv.save_word2vec_format(out_vector, binary=False)

四、总结

​ 在开发过程中,最终使用第一种按行读取文件数据的方式进行分词并训练得到模型。第二种方式读取的是目录下的多个文件,测试时分了20个文件分别读取,Memory Error问题不再出现。

参考:

https://blog.csdn.net/lilong1...;

https://blog.csdn.net/qq_3527...

作者:易企秀工程师 Emma


易企秀工程师
91 声望10 粉丝