词云图,也叫文字云,是对文本中出现频率较高的“关键词”予以视觉化的展现。词云图会过滤掉大量低频、低质的文本信息,使浏览者只要一眼扫过,就可以领略文本的主旨。
安装过程中可能会遇到一些问题。方法1:通过pip安装。如果安装出错,先查看具体的报错信息;如果是在下载某个依赖包的过程中出错,可以到Python包主页(PyPI)搜索该包,下载后再手动安装。
# Install the word cloud package.
# "python -m pip" guarantees the packages are installed for the same
# interpreter that will later run the scripts.
python -m pip install wordcloud

# Install the jieba Chinese word segmentation package.
python -m pip install jieba
-
方法2:下载.whl文件,下载地址:http://www.lfd.uci.edu/~gohlke/pythonlibs/#wordcloud
使用cd命令进入whl文件的路径
运行这条命令:
- python -m pip install <filename>
-
# Import the plotting library, the word cloud generator and the jieba tokenizer.
import io

import matplotlib.pyplot as plt
from wordcloud import WordCloud
import jieba

# Read the text file. io.open accepts an explicit encoding on both Python 2
# and Python 3; UTF-8 is assumed here (adjust if JsIndex.txt was saved with
# another encoding). The with-statement closes the file handle.
with io.open('JsIndex.txt', encoding='utf-8') as text_file:
    text_from_file_with_apath = text_file.read()

# Segment the text with jieba (full mode) and join the tokens with spaces,
# because WordCloud splits its input on whitespace.
wordlist_after_jieba = jieba.cut(text_from_file_with_apath, cut_all=True)
wl_space_split = " ".join(wordlist_after_jieba)

# Path to a font file that contains Chinese glyphs (for example a SimHei or
# Noto Sans CJK .ttf/.otf file). None keeps WordCloud's default font, which
# typically cannot render Chinese and shows empty boxes instead.
font_path = None

# Build the word cloud from the segmented text.
my_wordcloud = WordCloud(font_path=font_path).generate(wl_space_split)

# Display the word cloud with pyplot.
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()
-
入门可以参考文章《python词云 wordcloud 入门》。安装时建议先下载依赖包再手动安装,我用pip命令在线安装了几次,一直超时失败。
# -*- coding: utf-8 -*-
# Crawl a page and append the text of selected links to JsIndex.txt.
# Written to run on both Python 2 and Python 3: print is always called with a
# single argument, and the urllib names are resolved with a fallback import.
import re  # unused in this snippet; kept from the original import list
import urllib  # unused in this snippet; kept from the original import list
from contextlib import closing

from lxml import etree

try:  # Python 3
    from urllib.request import Request, urlopen
except ImportError:  # Python 2
    from urllib2 import Request, urlopen


class CrawlJs(object):
    """Download a page and save the link texts found in it."""

    def getArticle(self, url):
        """Fetch ``url`` and return the raw response body.

        The request carries a desktop-browser User-Agent header.
        """
        print('█████████████◣开始爬取数据')
        my_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.104 Safari/537.36',
        }
        request = Request(url, headers=my_headers)
        # closing() releases the connection on Python 2 as well, where the
        # response object is not a context manager.
        with closing(urlopen(request)) as response:
            return response.read()

    def save(self, content):
        """Extract link texts from ``content`` and append them to JsIndex.txt.

        ``content`` is parsed as HTML; the text of every ``<a>`` directly
        inside ``<div class="content">`` is printed and written as one
        UTF-8 encoded line.
        """
        xml = etree.HTML(content)
        datas = xml.xpath('//div[@class="content"]/a/text()')
        print(datas)
        # Open the file once instead of once per item, and only when there is
        # something to write, so that an empty result creates no file.
        if datas:
            # Binary append mode: the lines are written as UTF-8 bytes, which
            # behaves the same on Python 2 and Python 3.
            with open('JsIndex.txt', 'ab') as f:
                for data in datas:
                    print(data)
                    f.write(data.encode('utf-8') + b'\n')
        print('█████████████◣爬取完成!')


# Script entry point.
if __name__ == '__main__':
    url = 'http://www.jianshu.com/'
    js = CrawlJs()
    content = js.getArticle(url)
    js.save(content)
-
import os
from os import path

from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Directory that holds this script and its data files. __file__ is not defined
# in interactive sessions and some IDEs, so fall back to the working directory.
d = path.dirname(__file__) if "__file__" in globals() else os.getcwd()

# Read the whole text; the with-statement closes the file handle.
with open(path.join(d, 'alice.txt')) as text_file:
    text = text_file.read()

# Read the mask / color image taken from
# http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010
alice_coloring = np.array(Image.open(path.join(d, "alice_color.png")))

# Work on a copy so the library-wide STOPWORDS set is not modified.
stopwords = set(STOPWORDS)
stopwords.add("said")

wc = WordCloud(background_color="white", max_words=2000, mask=alice_coloring,
               stopwords=stopwords, max_font_size=40, random_state=42)
# Generate the word cloud.
wc.generate(text)

# Create the coloring function from the image.
image_colors = ImageColorGenerator(alice_coloring)

# Figure 1: the word cloud with its default colors.
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")

# Figure 2: the word cloud recolored from the image.
# color_func=image_colors could also be passed directly to the constructor.
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
plt.axis("off")

# Figure 3: the mask / color image itself.
plt.figure()
plt.imshow(alice_coloring, cmap=plt.cm.gray, interpolation="bilinear")
plt.axis("off")
plt.show()
-
import os
from os import path

import matplotlib.pyplot as plt
import numpy as np
from PIL import Image

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Directory of the current file. __file__ is not defined when the code is run
# line by line in an IDE or interactive session, so fall back to the working
# directory in that case.
d = path.dirname(__file__) if "__file__" in globals() else os.getcwd()

# Read the text. alice.txt is found in the example directory of the wordcloud
# package and begins like this:
#
#   Project Gutenberg's Alice's Adventures in Wonderland, by Lewis Carroll
#
#   This eBook is for the use of anyone anywhere at no cost and with
#   almost no restrictions whatsoever.  You may copy it, give it away or
#   re-use it under the terms of the Project Gutenberg License included
#   with this eBook or online at www.gutenberg.org
with open(path.join(d, 'alice.txt')) as text_file:
    text = text_file.read()

# Read the mask / color image, taken from
# http://jirkavinse.deviantart.com/art/quot-Real-Life-quot-Alice-282261010
# scipy.misc.imread no longer exists in current SciPy releases, so the image
# is loaded with Pillow and converted to a NumPy array instead.
alice_coloring = np.array(Image.open(path.join(d, "alice_color.png")))

# set.add() returns None, so the stop word set must be built before it is
# passed in; copying also leaves the library-wide STOPWORDS untouched.
stopwords = set(STOPWORDS)
stopwords.add("said")

wc = WordCloud(
    background_color="white",  # background color
    max_words=2000,            # maximum number of words shown in the cloud
    mask=alice_coloring,       # image used as the mask
    stopwords=stopwords,       # words excluded from the cloud
    max_font_size=40,          # largest font size
    random_state=42,           # fixed seed for a reproducible layout
)

# Generate the word cloud. generate() takes the full text (which does not
# segment Chinese well); alternatively compute the word frequencies yourself
# and call generate_from_frequencies:
#   wc.generate_from_frequencies(txt_freq)
# NOTE: recent wordcloud versions expect txt_freq to be a dict such as
# {'词a': 100, '词b': 90, '词c': 80}; older versions took a list of tuples.
wc.generate(text)

# Build the color function from the image.
image_colors = ImageColorGenerator(alice_coloring)

# Figure 1: the word cloud with its default colors.
plt.imshow(wc)
plt.axis("off")

# Figure 2: the word cloud recolored from the image.
# color_func=image_colors could also be passed directly to the constructor.
plt.figure()
plt.imshow(wc.recolor(color_func=image_colors))
plt.axis("off")

# Figure 3: the mask / color image itself.
plt.figure()
plt.imshow(alice_coloring, cmap=plt.cm.gray)
plt.axis("off")
plt.show()

# Save the word cloud (recolored in place by recolor() above) to a file.
wc.to_file(path.join(d, "名称.png"))
-