注意事项:
- 生成词云的图片必须是黑白照片
- 豆瓣不登录用户只能访问到影评的前12页的数据
- 登录用户最多可以看到前24页的数据(豆瓣影评数据只展示24页)
- 本方法是上一篇博客的另一种写法
- 爬取豆瓣的全部内容目前还不太容易,真正可行的方法我还没见到,希望各位大侠能真正地爬取出来并分享。
import requests
from bs4 import BeautifulSoup
import time
import jieba
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import re
def getHtml(url):
    """Download a page and return its HTML text.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The response body decoded as UTF-8, or ``None`` when the request
        fails or the server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/63.0.3239.132 Safari/537.36 QIHU 360EE',
    }
    try:
        # A timeout keeps a stalled connection from hanging the crawl forever.
        r = requests.get(url, headers=headers, cookies={'cookie': '1012'},
                         timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are reported here; anything else
        # (KeyboardInterrupt, programming errors) is allowed to propagate.
        print("到此为止。原因豆瓣不登录用户只能访问到影评的前12页的数据,"
              "登录后的用户可以看到前24页的数据(豆瓣影评数据只展示24页")
        return None
    r.encoding = "utf-8"
    return r.text
def getData(html):
    """Extract the short comments from a page and store them in ``new.txt``.

    ``new.txt`` is overwritten on every call, so afterwards it holds only
    the comments of the page passed in.

    Args:
        html: HTML text of a comments page. A falsy value (such as ``None``)
            is treated as a page without comments.
    """
    soup = BeautifulSoup(html or "", "html.parser")
    comment_list = soup.find('div', attrs={'class': 'mod-bd'})
    # The context manager guarantees the data is flushed and the handle
    # closed before any later step re-opens the file.
    with open("new.txt", 'wb') as f:
        if comment_list is None:
            # No comment container on this page: leave the file empty.
            return
        for comment in comment_list.find_all('div', attrs={'class': 'comment-item'}):
            short = comment.find('span', attrs={'class': 'short'})
            if short is None:
                # Skip items that carry no short-comment text.
                continue
            f.write(short.get_text().encode('utf-8'))
def seg_sentence():
    """Reduce ``new.txt`` to its Chinese characters, rewriting it in place.

    Every character outside the range U+4E00..U+9FA5 (whitespace,
    punctuation, digits, Latin letters, ...) is dropped; the remaining
    characters are written back as one continuous string.
    """
    # Load the crawled text
    with open("new.txt", 'r', encoding='utf-8') as src:
        raw = src.read()
    # The character-class match already discards whitespace, so no
    # separate per-character stripping pass is needed.
    newtxt = ''.join(re.findall(r'[\u4e00-\u9fa5]', raw))
    with open("new.txt", "w", encoding='utf-8') as dst:
        dst.write(newtxt)
def wordcloud(m):
    """Render the text in ``new.txt`` as a word cloud shaped by a mask image.

    The mask is read from ``fbb<m>.png`` and the resulting picture is
    written to ``wc<m>.png`` and displayed with matplotlib.

    Args:
        m: Value whose string form selects the mask file and names the
            output file.
    """
    # Load the mask image; converting inside the ``with`` forces the pixel
    # data to be read before the file handle is released.
    with Image.open('fbb' + str(m) + '.png', 'r') as image:
        img = np.array(image)

    with open('new.txt', 'r', encoding='utf-8') as cut:
        cut_txt = cut.read()

    # Segment the text and keep every token whose length is not 1.
    txtls = [word for word in jieba.lcut(cut_txt) if len(word) != 1]
    newtxt = ' '.join(txtls)

    cloud = WordCloud(
        mask=img,  # when a mask is given, height and width are ignored
        background_color='white',
        max_words=500,  # upper bound on the number of words drawn
        max_font_size=40,
        font_path="simhei.ttf").generate(newtxt)

    # Save before showing: plt.show() may block until the window is closed,
    # and the output file should not depend on that.
    cloud.to_file('wc' + str(m) + '.png')

    # Display the picture without axes
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.show()
def main():
    """Crawl the comment pages of one Douban movie and build a word cloud per page.

    Pages are requested 20 comments at a time for start offsets 0..280.
    The crawl stops at the first page that cannot be downloaded.
    """
    # NOTE(review): the source layout lost its indentation; the per-page
    # processing below is assumed to belong inside the loop — confirm.
    for i, k in enumerate(range(0, 300, 20), start=1):
        url = ('https://movie.douban.com/subject/26752088/comments?start='
               + str(k) + '&limit=20&sort=new_score&status=P')
        print("正在爬取第" + str(i) + "页的数据")
        time.sleep(1)  # pause between requests
        html = getHtml(url)
        if html is None:
            # getHtml yields None when the page could not be fetched;
            # stop instead of feeding None into the parser.
            break
        getData(html)
        seg_sentence()
        wordcloud(i)


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
继续阅读与本文标签相同的文章
-
B树和B+树索引原理
2026-05-18栏目: 教程
-
汇编(八)[bx] 、Loop指令
2026-05-18栏目: 教程
-
分布式关系型数据库服务 DRDS 提供 Chunk-Based 执行器,大幅优化复杂查询执行效率
2026-05-18栏目: 教程
-
2019云栖大会 | 究竟哪款NoSQL数据库最适合你?
2026-05-18栏目: 教程
-
CNCF 宣布成立应用交付领域小组,正式开启云原生应用时代
2026-05-18栏目: 教程
