Chinese word segmentation with the jieba library
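
The script below reads every cell of an Excel sheet into a temporary text file, runs jieba's TF-IDF keyword extraction (jieba.analyse.extract_tags) on each line, counts how often each extracted word appears, and writes the sorted counts to wordCount.txt and wordCount.xls. Note that it uses extract_tags rather than plain jieba.cut segmentation. A minimal sketch of the two calls, with a made-up sample sentence (the exact output depends on jieba's version and dictionary):

import jieba
import jieba.analyse

text = "我来到北京清华大学"                        # illustrative sample sentence
print("/".join(jieba.cut(text)))                   # precise-mode segmentation, e.g. 我/来到/北京/清华大学
print(jieba.analyse.extract_tags(text, topK=5))    # top 5 TF-IDF keywords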

# coding=utf-8
import xlwt            # library for writing Excel files
import xlrd            # library for reading Excel files
import jieba
import jieba.analyse


# Dump every cell of the Excel sheet into a plain-text file, one cell per line
def excel2txt(file_path):
    data = xlrd.open_workbook(file_path)      # open the Excel workbook
    sh = data.sheet_by_name("Sheet1")         # look the sheet up by name
    with open("temp.txt", "w", encoding="utf-8") as f:
        for n in range(sh.nrows):
            for i in range(sh.ncols):
                f.write(str(sh.cell_value(n, i)))
                f.write('\n')


# jieba keyword extraction and word-frequency counting
def jieba_analyse():
    wbk = xlwt.Workbook(encoding='utf-8')
    sheet = wbk.add_sheet("wordCount")        # worksheet name in the Excel output
    word_lst = []

    with open('temp.txt', encoding='utf-8') as rf:       # temp.txt is the text to be analysed
        for line in rf:
            item = line.strip('\n\r').split('\t')        # split the line on tabs
            tags = jieba.analyse.extract_tags(item[0])   # jieba TF-IDF keyword extraction
            word_lst.extend(tags)

    # count how often each extracted word occurs
    word_dict = {}
    for item in word_lst:
        word_dict[item] = word_dict.get(item, 0) + 1

    # sort the words by frequency, highest first
    sorted_items = sorted(word_dict.items(), key=lambda kv: kv[1], reverse=True)

    # write the counts to a text file and to the Excel sheet
    with open("wordCount.txt", 'w', encoding='utf-8') as wf2:
        for i, (key, count) in enumerate(sorted_items):
            wf2.write(key + ' ' + str(count) + '\n')
            sheet.write(i, 0, label=key)
            sheet.write(i, 1, label=count)
    wbk.save('wordCount.xls')                 # save as wordCount.xls


# Run the full pipeline on the given Excel workbook
def analyse_word(file_path):
    excel2txt(file_path)
    jieba_analyse()


analyse_word('data2.xlsx')
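
Running analyse_word('data2.xlsx') leaves three files next to the script: temp.txt (one Excel cell per line), wordCount.txt (word and count per line, sorted by frequency), and wordCount.xls (the same counts in a sheet named wordCount). One dependency caveat: xlrd 2.0 and later only reads .xls files, so reading an .xlsx workbook requires an older xlrd (for example 1.2.0) or converting the file to .xls first.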