第一个爬虫程序
直接运行代码即可。点滴学习,点滴进步。
# -*- coding:utf-8 -*-
import re
import requests
from bs4 import BeautifulSoup
from urllib import request
from urllib import error
def url_get(num_retries=5):
    """Check that the crawl entry page is reachable and return its URL.

    The entry URL is hard-coded. It is requested with a browser-like
    User-Agent header. A 5xx response is retried up to ``num_retries`` more
    times; any other HTTP response counts as reachable.

    Args:
        num_retries: Number of additional attempts allowed after a 5xx
            response. Negative values are treated as 0.

    Returns:
        The entry URL when the server answered with a non-5xx status,
        otherwise ``None`` (after printing a failure message).
    """
    url = "http://www.newchinalife.com/ncl/cn/new/index/index.shtml"
    # Browser-like User-Agent; other header fields can be added here as well.
    kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko'}
    for _ in range(max(num_retries, 0) + 1):
        try:
            # A timeout keeps an unresponsive server from hanging the script.
            response = requests.get(url, headers=kv, timeout=10)
        except requests.RequestException:
            # requests raises its own exception hierarchy (not urllib.error).
            # A failed connection is not retried; only 5xx answers are.
            break
        if not 500 <= response.status_code < 600:
            return url
    print("url无法连接")
    return None
def spiderpage(url):
    """Download ``url`` and return every link target found in its source.

    The page text is scanned with a regular expression that captures the
    value of ``<a href="...">`` (double-quoted) and of ``href='...'``
    (single-quoted) attributes.

    Args:
        url: Address of the page to download.

    Returns:
        A list of the captured href strings, in document order. When the
        request fails, a one-element list holding the site's entry page is
        returned instead (and a message is printed).
    """
    kv = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER'}
    try:
        # A timeout keeps one dead link from stalling the whole crawl.
        r = requests.get(url, headers=kv, timeout=10)
    except requests.RequestException:
        print("这个网站有点东西")
        return ['http://www.newchinalife.com/ncl/cn/new/index/index.shtml']
    # Let requests guess the charset from the body instead of the headers.
    r.encoding = r.apparent_encoding
    # Lookbehind/lookahead keep only the text between the quotes.
    return re.findall(r'(?<=<a href=").*?(?=")|(?<=href=\').*?(?=\')', r.text)
def gettitle(url):
    """Fetch ``url`` and return the text of its ``<title>`` element.

    The caller uses the returned title to decide whether a page still
    belongs to the target site, so that the crawl does not wander off-site.

    Args:
        url: Address of the page to inspect. It is printed before fetching.

    Returns:
        The title text when the page has one; ``"这网站没有灵性"`` when the
        page has a body but no usable title; ``"不可加载"`` when the page
        cannot be fetched or decoded as UTF-8, or has no ``<body>``.
    """
    # Function-scope import: IncompleteRead/BadStatusLine are not OSError.
    from http.client import HTTPException

    headers = {
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
        'Connection': 'keep-alive',
        'Referer': 'http://www.baidu.com/'
    }
    print(url)
    try:
        # Request() itself raises ValueError for a malformed URL, so it must
        # sit inside the try block too.
        req = request.Request(url, headers=headers)
        with request.urlopen(req, timeout=10) as response:
            html = response.read().decode('utf-8')
    except (OSError, ValueError, HTTPException):
        # OSError covers URLError/HTTPError/timeouts; ValueError covers
        # UnicodeDecodeError and unknown URL types.
        print("这网站没有灵性")
        return "不可加载"

    soup = BeautifulSoup(html, "html.parser")
    if soup.body is None:
        return "不可加载"

    # <head> or <title> may be absent; check explicitly instead of relying
    # on an AttributeError being swallowed.
    title_tag = soup.head.title if soup.head is not None else None
    title = title_tag.string if title_tag is not None else None
    print(title)
    if title is None:
        return "这网站没有灵性"
    return title
def url_filtrate(pagelinks):
    """Filter raw href values down to unique, absolute, same-site URLs.

    For each link, all whitespace is removed and then:

    * links starting with ``java``, ``jse``, ``ALL`` or ``login``, ending
      with ``pdf`` or ``css``, or containing ``@`` are discarded;
    * absolute ``http...`` links are kept only when they contain
      ``newchinalife``;
    * relative links that start with ``/`` or end with ``gsp``, ``shtml``
      or a digit are prefixed with ``https://www.newchinalife.com``;
    * everything else is discarded.

    The input list is not modified.

    Args:
        pagelinks: Iterable of href strings taken from a page.

    Returns:
        A list of accepted URLs in first-seen order without duplicates.
        The list is also printed.
    """
    site_root = "https://www.newchinalife.com"
    rejected_prefixes = ("java", "jse", "ALL", "login")
    rejected_suffixes = ("pdf", "css")

    same_target_url = []
    for raw in pagelinks:
        murl = re.sub(r'\s+', '', str(raw))
        if (murl.startswith(rejected_prefixes)
                or murl.endswith(rejected_suffixes)
                or "@" in murl):
            continue
        if murl.startswith("http"):
            # Absolute link: keep it only when it stays on the target site.
            if "newchinalife" in murl:
                same_target_url.append(murl)
            continue
        if murl.startswith("/"):
            same_target_url.append(site_root + murl)
        elif re.search(r'(?:gsp|shtml|[0-9]+)$', murl):
            # Relative path without a leading slash: add the separator so
            # the host name and the path do not run together.
            same_target_url.append(site_root + "/" + murl)

    # dict preserves insertion order, giving an order-stable de-duplication.
    unrepect_url = list(dict.fromkeys(same_target_url))
    print(unrepect_url)
    return unrepect_url
class Quence:
    """Bookkeeping for the crawl: visited URLs and a FIFO of pending URLs.

    New pending URLs are inserted at the front of ``unvisited`` and taken
    from the back, so URLs are handed out in the order they were added.
    Method names (including their original spelling) are kept unchanged
    for existing callers.
    """

    def __init__(self):
        # URLs that have already been crawled.
        self.visited = []
        # URLs still waiting to be crawled.
        self.unvisited = []

    def getvisitedurl(self):
        """Return the list of visited URLs."""
        return self.visited

    def getunvisitedurl(self):
        """Return the list of pending URLs."""
        return self.unvisited

    def addvisitedurl(self, url):
        """Record ``url`` as visited."""
        return self.visited.append(url)

    def removevisitedurl(self, url):
        """Remove ``url`` from the visited list.

        Raises:
            ValueError: If ``url`` is not in the visited list.
        """
        return self.visited.remove(url)

    def unvisitedurldequence(self):
        """Pop and return the oldest pending URL, or ``None`` when empty."""
        if not self.unvisited:
            return None
        return self.unvisited.pop()

    def addunvisitedurl(self, url):
        """Queue ``url`` unless it is empty, ``None``, visited or queued."""
        if not url:
            return None
        if url in self.visited or url in self.unvisited:
            return None
        return self.unvisited.insert(0, url)

    def getvisitedurlount(self):
        """Return the number of visited URLs."""
        return len(self.visited)

    def getunvistedurlcount(self):
        """Return the number of pending URLs."""
        return len(self.unvisited)

    def unvisitedurlsempty(self):
        """Return ``True`` when no URL is pending."""
        return len(self.unvisited) == 0
class Spider():
    """Crawler that walks same-site pages starting from one entry URL."""

    def __init__(self, url):
        """Create the URL queue and seed it with ``url``, the crawl entry."""
        self.linkQuence = Quence()
        self.linkQuence.addunvisitedurl(url)

    def crawler(self, urlcount):
        """Crawl pending URLs until the queue is empty or the limit is hit.

        A page is accepted when its title contains "新华保险" and at least
        one link on it survives ``url_filtrate``. Accepted pages are marked
        visited and their filtered links are queued.

        Args:
            urlcount: Maximum number of pages to accept. The site has many
                sub-pages, so this bounds a test run.

        Returns:
            The list of accepted (visited) page URLs.
        """
        x = 1  # 1-based index of the next page to be accepted
        # URLs already fetched in this run, accepted or not. Rejected pages
        # never reach the visited list, so without this set they would be
        # downloaded again every time another page links to them.
        attempted = set()
        # Stop when nothing is pending OR the page limit has been reached.
        while self.linkQuence.unvisited and x <= urlcount:
            if x > 1:
                print(f"第{x-1}个url,开始爬")
            visitedurl = self.linkQuence.unvisitedurldequence()
            if visitedurl is None or visitedurl == '' or visitedurl in attempted:
                continue
            attempted.add(visitedurl)

            title = gettitle(visitedurl)
            if not re.findall("新华保险", title):
                # Title does not mention the site: treat the page as off-site.
                continue

            initial_links = spiderpage(visitedurl)  # every link on the page
            right_links = url_filtrate(initial_links)  # acceptable links only
            if not right_links:
                continue

            self.linkQuence.addvisitedurl(visitedurl)
            for link in right_links:
                self.linkQuence.addunvisitedurl(link)
            x += 1
        print("爬完了")
        return self.linkQuence.visited
def writetofile(urllist):
    """Append every URL in ``urllist`` to ``Furls.txt``, one per line.

    The file is opened once in append mode with UTF-8 encoding and is
    closed even if writing fails. A summary with the number of URLs
    written is printed afterwards.

    Args:
        urllist: Iterable of URL strings to store.
    """
    written = 0
    # Furls.txt collects the crawled links.
    with open('Furls.txt', 'a', encoding='utf8') as file:
        for url in urllist:
            file.write(f'{url}\n')
            written += 1
    print(f'写入已完成,总计{written}个网页的子链接')
# Script entry point: check the entry page, crawl, then store the results.
if __name__ == '__main__':
    url = url_get()
    if url is None:
        # The entry page could not be reached, so there is nothing to crawl.
        raise SystemExit(1)
    spider = Spider(url)
    # Upper bound on the number of pages to crawl.
    urllist = spider.crawler(5000)
    writetofile(urllist)
继续阅读与本文标签相同的文章
VUEX学习笔记
-
数据库基础技术实践#网络安全基础技术实践课程
2026-05-18栏目: 教程
-
MySQL每组求最值的记录与每组前N条记录
2026-05-18栏目: 教程
-
OCP-052考试题库汇总(55)-CUUG内部解答版
2026-05-18栏目: 教程
-
【云栖活动】架构师、产品经理一对一座谈会/WORKSHOP-已截止
2026-05-18栏目: 教程
-
MySQL入门书籍和方法分享
2026-05-18栏目: 教程
