import requests
class TiebaSpider:
    """Download list pages of one Baidu Tieba forum and save them as HTML files."""

    def __init__(self, tieba_name):
        """Store the forum name and prepare the URL template and headers.

        Args:
            tieba_name: Forum name; inserted into the ``kw`` query parameter
                and used as the prefix of the saved file names.
        """
        self.tieba_name = tieba_name
        # The trailing ``{}`` is the placeholder for the ``pn`` query value,
        # filled in by make_url_lists().
        self.url_ = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        self.headers = {"User-Agent": "WSF"}

    def make_url_lists(self):
        """Return the list of URLs to download (``pn`` = 1 through 10)."""
        # NOTE(review): Tieba's ``pn`` looks like a post offset rather than a
        # page index (commonly stepped by 50) -- confirm before relying on
        # these ten URLs being ten distinct pages.
        return [self.url_.format(i) for i in range(1, 11)]

    def download_url(self, url_str):
        """Fetch ``url_str`` with ``requests.get`` and return the raw body bytes.

        Args:
            url_str: Absolute URL to request.

        Returns:
            The response body as ``bytes`` (``Response.content``).
        """
        result = requests.get(url_str, headers=self.headers)
        return result.content

    def save(self, result, page_num):
        """Write one downloaded page to ``<tieba_name>-第<page_num>页.html``.

        Args:
            result: Page body as ``bytes``; written unchanged in binary mode.
            page_num: Page number used in the file name.
        """
        file_path = "{}-第{}页.html".format(self.tieba_name, page_num)
        with open(file_path, "wb") as f:
            f.write(result)

    def run(self):
        """Download every URL from make_url_lists() and save each page in order."""
        # enumerate() supplies the 1-based page number directly instead of
        # searching the list for each URL.
        for page_num, url_str in enumerate(self.make_url_lists(), start=1):
            page_bytes = self.download_url(url_str)
            self.save(page_bytes, page_num)
            		
if __name__ == '__main__':
    # Script entry point: download the list pages of the "薛之谦" forum
    # into the current working directory.
    tieba_spider = TiebaSpider("薛之谦")
    tieba_spider.run()
	

理解 session 和 cookie

session:当用户首次访问 HTTP 服务器时,服务器会为其生成一个 sessionID(唯一标识),该标识在一定的有效期内可用.浏览器会把 sessionID 保存在 cookie 中,下次访问时随请求一并发送,服务器据此即可找到该用户之前的会话记录.

session 是服务器端为某个用户生成并保存的一个字符串,用来唯一标识该客户端的访问(类似健身中心的会员卡).

cookie 是存储在客户端的数据,其中含有 sessionID;客户端把它随请求发送给服务器,用来表明用户身份.

import l .html

import requests
import re

def parse_form(html):
    """Collect the named ``<input>`` fields found inside forms of an HTML page.

    Args:
        html: HTML source of the page.

    Returns:
        dict mapping each input's ``name`` attribute to its ``value``
        attribute (``None`` when the input has no ``value``). Inputs without
        a ``name`` are skipped, and a later input with the same name
        overwrites an earlier one. Empty or whitespace-only input yields
        an empty dict.
    """
    # An empty document has no form fields; return early instead of handing
    # it to the parser.
    if not html or not html.strip():
        return {}

    # Local import keeps this helper self-contained. It needs the
    # third-party ``lxml`` package; ``.cssselect()`` additionally relies on
    # the ``cssselect`` package.
    import lxml.html

    tree = lxml.html.fromstring(html)
    return {
        field.get('name'): field.get('value')
        for field in tree.cssselect('form input')
        if field.get('name')
    }

def get_cookie(email='1432786767@qq.com', password='2336517498',
               output_path='login1.html'):
    """Log in to example.webscraping.com and return the session cookies.

    The login page is fetched first so that its form fields (collected with
    ``parse_form``) can be sent back together with the credentials.

    Args:
        email: Account e-mail placed in the ``email`` form field.
        password: Account password placed in the ``password`` form field.
        output_path: File that receives the text of the final response.

    Returns:
        dict of the cookies held by the session after the login requests.
    """
    # NOTE(review): the default credentials are hard-coded in source; prefer
    # passing them in (e.g. from the environment) and dropping the defaults.
    login_url = 'http://example.webscraping.com/places/default/user/login?_next=/places/default/index'

    # The context manager closes the session's connections when done.
    with requests.session() as s:
        result = s.get(login_url)
        post_data = parse_form(result.text)
        # Cookies set by the initial GET, before any credentials are sent.
        print(s.cookies.get_dict())

        post_data['email'] = email
        post_data['password'] = password
        s.post(login_url, post_data)

        # NOTE(review): the page that gets saved is the response to a second,
        # empty POST to the login URL; presumably the intent is to capture
        # the post-login page -- confirm whether a GET of the index page was
        # meant instead.
        rs = s.post(login_url)
        cookies = s.cookies.get_dict()

    # Explicit UTF-8 so writing does not depend on the platform's default
    # encoding.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(rs.text)

    return cookies

if __name__ == '__main__':
    # Script entry point: perform the login and save the resulting page.
    get_cookie()

收藏 打印