
2. Corpora and Vector Spaces

import logging

# Log gensim's progress at INFO level
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load the corpus: each line is assumed to hold the document text followed
# by '$$$' and extra fields; keep only the text part
documents = [line.split('$$$')[0].strip()
             for line in open('../data/corpus.txt', 'r', encoding='utf-8')]
print(documents[:5])

# Remove stopwords; tokenization here is character-level, so each Chinese
# character is one token and stopwords is simply a string of characters
stopwords = '啊呢吗的'
texts = [[word for word in document if word not in stopwords] for document in documents]
print(texts[:5])

# Count token frequencies
from collections import defaultdict
freq = defaultdict(int)
for text in texts:
    for word in text:
        freq[word] += 1
# Keep only tokens that appear more than once
texts = [[word for word in text if freq[word] > 1] for text in texts]
print(texts[:5])
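
# A quick sanity check (a sketch, not part of the original pipeline): sort
# the freq dict to inspect the most common tokens
top_tokens = sorted(freq.items(), key=lambda kv: kv[1], reverse=True)[:10]
print(top_tokens)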

# Build the dictionary: a mapping from each token to an integer id
from gensim import corpora
dictionary = corpora.Dictionary(texts)
print(dictionary)
print(dictionary.token2id)

# Persist the dictionary to disk (the ./model directory must already exist)
dictionary.save('./model/dictionary.m')
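
# The counterpart to save(): a persisted Dictionary can be loaded back in a
# later session via gensim's standard Dictionary.load (sketch)
loaded_dictionary = corpora.Dictionary.load('./model/dictionary.m')
print(loaded_dictionary)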

# Vectorize a new document as a bag of words; list() splits the string into
# characters to match the character-level tokenization above
new_text = '今天天气比较好,适合出去旅游'
new_vec = dictionary.doc2bow(list(new_text))
print(new_vec)
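
# Map ids back to tokens to make the vector readable (sketch); note that
# doc2bow silently ignores characters that are not in the dictionary
print([(dictionary[token_id], count) for token_id, count in new_vec])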


# Serialize the corpus in various on-disk formats
corpus = [dictionary.doc2bow(text) for text in texts]

# Matrix Market format (the serialize and load paths must match)
corpora.MmCorpus.serialize('/tmp/corpus.mm', corpus)
corpus = corpora.MmCorpus('/tmp/corpus.mm')

# SVMlight format
corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)
corpus = corpora.SvmLightCorpus('/tmp/corpus.svmlight')

# Blei's LDA-C format
corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)
corpus = corpora.BleiCorpus('/tmp/corpus.lda-c')

# GibbsLDA++ (List-of-Words) format
corpora.LowCorpus.serialize('/tmp/corpus.low', corpus)
corpus = corpora.LowCorpus('/tmp/corpus.low')
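
# MmCorpus is loaded lazily: iterating streams one document at a time from
# disk, so large corpora need not fit in memory (sketch)
mm = corpora.MmCorpus('/tmp/corpus.mm')
for i, doc in enumerate(mm):
    print(doc)
    if i >= 2:  # show only the first few documents
        break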

# Memory-friendly approach 1: stream documents instead of loading them all;
# apply the same '$$$' split as above so only the text part is vectorized
class MyCorpus:
    def __iter__(self):
        for line in open('../data/corpus.txt', 'r', encoding='utf-8'):
            yield dictionary.doc2bow(list(line.split('$$$')[0].strip()))
corpus_friendly = MyCorpus()
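
# Usage sketch: each pass over MyCorpus re-reads the file line by line and
# yields one bag-of-words vector at a time, keeping memory use constant
for vector in corpus_friendly:
    print(vector)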

# Memory-friendly approach 2: find ids of tokens that appear in only one
# document; dictionary.dfs maps token id -> number of documents containing it
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]

# Compact the dictionary
dictionary.filter_tokens(once_ids)  # remove tokens that occur in only one document
dictionary.compactify()  # remove the id gaps left by the removed tokens
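
# Token ids change after filtering, so previously built bag-of-words vectors
# are stale; a sketch of rebuilding and re-saving under the new ids
corpus = [dictionary.doc2bow(text) for text in texts]
dictionary.save('./model/dictionary.m')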