1:创建项目

\"\"

2:创建爬虫

\"\"

3:编写start.py文件用于运行爬虫程序

# -*- coding:utf-8 -*-
# Author:  baikai
# Created: 2018/12/14 14:09
# File:    start.py
# IDE:     PyCharm
from scrapy import cmdline

# Hand Scrapy the same argv it would receive from the shell command
# `scrapy crawl js`, so the spider named "js" can be started (and debugged)
# by running this file from the IDE.
cmdline.execute(["scrapy", "crawl", "js"])

4:设置settings.py文件的相关设置

\"\"

\"\"

\"\"

爬取详情页数据

\"\"

编写items.py文件

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class ArticleItem(scrapy.Item):
    """Container for the data scraped from one article detail page."""

    # Article headline. The field name was missing in the original text
    # (the line read ` =scrapy.Field()`, which is a syntax error); `title`
    # is restored to match the keyword the spider passes when building items.
    title = scrapy.Field()
    # Article body.
    content = scrapy.Field()
    # Identifier of the article (last path segment of the article URL).
    article_id = scrapy.Field()
    # URL the article was fetched from.
    origin_url = scrapy.Field()
    # Author display name.
    author = scrapy.Field()
    # URL of the author's avatar image.
    avatar = scrapy.Field()
    # Publication time as shown on the page.
    pub_time = scrapy.Field()

编写js.py

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu_spider.items import ArticleItem


class JsSpider(CrawlSpider):
    """Crawl jianshu.com and yield one ArticleItem per article detail page."""

    name = 'js'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        # Article detail pages look like https://www.jianshu.com/p/d8804d18d638
        # (a 12-character lowercase alphanumeric id after /p/). Matching links
        # are parsed by parse_detail and also followed for further links.
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        """Parse one article detail page into an ArticleItem.

        Args:
            response: the Scrapy response for an article detail page.

        Yields:
            ArticleItem: a single item holding the extracted fields. A field
            is None when its XPath matches nothing on the page.
        """
        # Article headline
        title = response.xpath("//h1[@class='title']/text()").get()
        # Author avatar image URL
        avatar = response.xpath("//a[@class='avatar']/img/@src").get()
        # Author display name
        author = response.xpath("//span[@class='name']/a/text()").get()
        # Publication time
        pub_time = response.xpath("//span[@class='publish-time']/text()").get()

        # Article id: drop any query string, then take the last path segment,
        # e.g. https://www.jianshu.com/p/d8804d18d638?x=1 -> d8804d18d638
        url_without_query = response.url.split("?")[0]
        article_id = url_without_query.split("/")[-1]

        # Article body, kept as the full HTML of the content container
        content = response.xpath("//div[@class='show-content']").get()

        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            pub_time=pub_time,
            origin_url=response.url,
            article_id=article_id,
            content=content,
        )
        yield item

 

收藏 打印