需要实现的功能:给出一个网站列表,抓出这些网页上的图片。

实现方式: 下载网页源码,在源码中识别包含图片url的标签,如<img>,<div>,<li>。由于对html了解较少,哪些标签可能含有图片是从查看多个网站的源码中总结出来的。

调用的库:Selenium(加载Chrome驱动)--获取执行JS后的源码。

  threading--实现多线程

代码:

\"\"\"\"
from urllib.parse import urljoin,urlparse
import os
import threading
from time import ctime
from selenium import webdriver
import re

class myThread(threading.Thread):
    """Thread that runs ``func(*args)`` and keeps the return value.

    Args:
        func: Callable executed in the new thread.
        args: Positional arguments passed to ``func``.
        name: Thread name.
    """

    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.name = name
        self.func = func
        self.args = args
        self.is_end = False
        # Initialised up front so getResult() is safe to call even when the
        # thread has not run yet or func raised before returning.
        self.res = None

    def getResult(self):
        """Return what ``func`` returned, or None if no result is available."""
        return self.res

    def run(self):
        """Call ``func`` with the stored arguments and record its result."""
        self.res = self.func(*self.args)

def filter_in_tag(page_file, tag):
    """Collect http(s) URLs found inside the opening tags of one element type.

    Args:
        page_file: Path of a UTF-8 encoded HTML file.
        tag: Element name to inspect, e.g. ``'li'`` or ``'div'``.

    Returns:
        A list whose first item is a marker line naming the tag, followed by
        every URL found in the attributes of matching opening tags.
    """
    # One pattern for both http and https; a URL ends at a quote or at ')'
    # so that CSS ``url(...)`` values are not captured with trailing text.
    url_pattern = re.compile(r'https?://[^\'\")]+')
    # Opening tags that carry attributes; ``[^>]*`` may span line breaks.
    tag_pattern = re.compile(r'<%s\s[^>]*>' % re.escape(tag))

    url_in_tag = ['------------------%s--------------------' % (tag)]
    with open(page_file, 'r', encoding='utf-8') as page:
        # Read the whole page so tags wrapped over several lines are found.
        html_text = page.read()
    for tag_str in tag_pattern.findall(html_text):
        url_in_tag.extend(url_pattern.findall(tag_str))
    return url_in_tag
def process(m_url):
    """Render one page in headless Chrome and sort out what it references.

    Args:
        m_url: Absolute URL of the page to load.

    Returns:
        ``[]`` when the page could not be loaded (the error is printed and
        written to ``err_log``). Otherwise a tuple
        ``(imgs_uniq, big_files, hrefs)``:

        * ``imgs_uniq`` - de-duplicated image URLs in discovery order,
          including one marker line per source (``<img>``, ``li``, ``div``).
        * ``big_files`` - anchor targets whose extension is not an HTML page.
        * ``hrefs`` - not-yet-seen page links on the same domain as
          ``m_url``; each one is also appended to the global ``seen_links``.
    """
    import tempfile  # only needed here; keeps the file's import block as is

    # Run Chrome without a visible window.
    chrome_options = webdriver.chrome.options.Options()
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    driver = webdriver.Chrome(
        r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe',
        chrome_options=chrome_options)

    # try/finally guarantees the browser process is shut down on every path,
    # including the early return taken when the page fails to load.
    try:
        driver.set_page_load_timeout(30)
        try:
            driver.get(m_url)
        except Exception as e:
            err_info = 'url open error: %s\n, reason: %s\n' % (m_url, e)
            print(err_info)
            err_log.write(err_info)
            return []

        imgs = ['------------------<img src=>-----------------']
        imgs.extend(x.get_attribute('src')
                    for x in driver.find_elements_by_tag_name("img"))

        # filter_in_tag() reads from a file. Each call gets its own temporary
        # file so that concurrent worker threads never read each other's page.
        tmp_fd, tmp_path = tempfile.mkstemp(suffix='.html')
        try:
            with open(tmp_fd, 'w', encoding='utf-8') as tmp_f:
                tmp_f.write(driver.page_source)
            for tag in ('li', 'div'):
                imgs.extend(filter_in_tag(tmp_path, tag))
        finally:
            os.remove(tmp_path)

        # Non-empty href values of all anchors, fetched once per element.
        links = []
        for anchor in driver.find_elements_by_tag_name('a'):
            href = anchor.get_attribute('href')
            if href:
                links.append(href)
    finally:
        driver.quit()

    # Drop empty entries and duplicates while keeping first-seen order.
    imgs_uniq = []
    known_imgs = set()
    for url in imgs:
        if url and url not in known_imgs:
            known_imgs.add(url)
            imgs_uniq.append(url)

    # Registrable part of the start page's host (its last two labels),
    # computed once because it does not change per link.
    host = urlparse(m_url).netloc.split('@')[-1].split(':')[0].lower()
    dom = '.'.join(host.split('.')[-2:])

    big_files, hrefs = [], []
    for link in links:
        if link.startswith('mailto:'):
            continue
        if not link.startswith('http'):
            link = urljoin(m_url, link)
        f_name = urlparse(link).path.split('/')[-1]
        f_type = os.path.splitext(f_name)[1]
        if f_type not in ('.htm', '.html', '.shtml', ''):
            big_files.append(link)
            continue
        if link in seen_links:
            continue
        # Compare against the link's host rather than the whole URL, so a
        # foreign URL that merely mentions the domain in its path or query
        # is not followed.
        link_host = urlparse(link).netloc.split('@')[-1].split(':')[0].lower()
        if link_host == dom or link_host.endswith('.' + dom):
            hrefs.append(link)
            seen_links.append(link)
    return imgs_uniq, big_files, hrefs




# Summarise one result list of process(). Example of the intended figures:
#   images: 100, HTTP share: 80%, extensions among HTTP URLs: jpg-50, gif-30
#   big files: 10, HTTP share: 100%, extensions among HTTP URLs: pdf-10

def ret_analyse(url_list):
    """Compute simple statistics for a list of URLs.

    Lines starting with a run of dashes are section markers, not URLs.

    Args:
        url_list: URL strings, possibly mixed with marker lines.

    Returns:
        A tuple ``(to_len, http_perc, exts_dict)``:

        * ``to_len`` - ``len(url_list)``, marker lines included; callers
          subtract the number of markers themselves.
        * ``http_perc`` - share of real URLs that use plain ``http://``,
          formatted like ``'80.0%'``; ``'0'`` when there are no real URLs.
        * ``exts_dict`` - file extension -> count over the ``http://`` URLs.
    """
    to_len = len(url_list)
    # Marker lines must not dilute the percentage.
    real_urls = [url for url in url_list if not url.startswith('-----------')]
    http_list = [url for url in real_urls if url.startswith("http://")]
    if real_urls:
        http_perc = '%.1f%%' % (len(http_list) / len(real_urls) * 100)
    else:
        http_perc = '0'

    exts_dict = {}
    for url in http_list:
        f_name = urlparse(url).path.split('/')[-1]
        f_type = os.path.splitext(f_name)[1]
        exts_dict[f_type] = exts_dict.get(f_type, 0) + 1
    return to_len, http_perc, exts_dict

# Run process() on a group of URLs and write the results to the output files.

# Several worker threads call group_proc() at the same time and share the
# same output files; the lock keeps each page's report in one piece.
_output_lock = threading.Lock()


def group_proc(url_f, urls, is_analyse):
    """Process every URL of one group and record what was found.

    Args:
        url_f: Open text file receiving the per-page report.
        urls: URLs to process.
        is_analyse: When true, add the statistics lines from ret_analyse().

    Returns:
        All follow-up page links reported by process() for this group.
    """
    links = []  # page links found in this group (big files excluded)

    def wLog(*lines):
        """Write each line to url_f; a failed write is reported, not raised."""
        for line in lines:
            try:
                url_f.write(line + '\n')
            except Exception as e:
                print('write eror,line:%s, err: %s' % (line, e))

    for url in urls:
        proc_ret = process(url)
        if not proc_ret:
            # process() returns an empty list when the page failed to load.
            continue
        img_list, bigfile_list, link_list = proc_ret

        report = ['*' * 40, 'from: ', url]  # separator + origin lines
        if is_analyse:
            img_total, img_perc, img_exts = ret_analyse(img_list)
            # The image list carries 3 marker lines that are not URLs.
            img_output = '图片:%d,HTTP协议占比:%s,HTTP协议下各种后缀的数量:%s' % (
                img_total - 3, img_perc, img_exts)
            big_output = '大文件:%d,HTTP协议占比:%s,HTTP协议下各种后缀的数量:%s' % (
                ret_analyse(bigfile_list))
            report.extend([img_output, big_output])

        img_text = '\n'.join(img_list)
        bigfile_text = '\n'.join(bigfile_list)
        report.extend(['imgs:', img_text, 'bigfiles: ', bigfile_text, '*' * 40])

        with _output_lock:
            wLog(*report)
            imgs_f.write(img_text + '\n')
            if bigfile_text:
                bigfiles_f.write(bigfile_text + '\n')

        links.extend(link_list)
    return links


def _split_links(links, max_groups=5):
    """Split links into the per-thread groups used by main().

    With fewer links than ``max_groups`` every link gets its own group;
    otherwise there are ``max_groups`` groups and the last one takes the
    remainder.
    """
    avg = len(links) // max_groups
    if avg == 0:
        return [[link] for link in links]
    groups = [links[k * avg:(k + 1) * avg] for k in range(max_groups - 1)]
    groups.append(links[(max_groups - 1) * avg:])
    return groups


def main(depth):
    """Crawl the sites listed in ``urls.txt`` level by level.

    Each level processes its links in up to five threads and writes a report
    to ``link<level>.txt``; the page links found become the next level.

    Args:
        depth: Maximum number of levels to crawl.
    """
    with open('urls.txt', 'r') as u_file:
        links = [line.strip() for line in u_file]
    # Keep URLs that already have a scheme, add one to the rest, and skip
    # blank lines.
    links = [link if link.startswith('http') else 'http://' + link
             for link in links if link]

    for level in range(depth):
        if not links:
            break
        print('第 %d 层开始爬取...' % (level))
        is_analyse = (level == 0)  # statistics only for the first level

        # The report file is opened only when there is work to do, and the
        # with-block closes it even if a worker fails.
        with open('link' + str(level) + '.txt', 'w', encoding='utf-8') as url_f:
            threads = [
                myThread(group_proc, (url_f, group, is_analyse),
                         group_proc.__name__)
                for group in _split_links(links)
            ]
            for idx, worker in enumerate(threads):
                print('线程%d开始运行,时间:%s' % (idx, ctime()))
                worker.daemon = True
                worker.start()

            # Wait for every worker and collect the links for the next level.
            next_links = []
            for idx, worker in enumerate(threads):
                worker.join()
                print('线程%d运行结束,时间:%s' % (idx, ctime()))
                try:
                    ret_links = worker.getResult()
                except AttributeError:
                    # The worker died before it stored a result.
                    ret_links = None
                next_links.extend(ret_links or [])
        links = next_links


if __name__ == '__main__':
    # Module-level state shared with process() and group_proc().
    seen_links = []
    # The with-statement closes all three output files even when the crawl
    # is aborted by an exception.
    with open('图片.txt', 'w', encoding='utf-8') as imgs_f, \
            open('大文件.txt', 'w', encoding='utf-8') as bigfiles_f, \
            open('err_log.txt', 'w', encoding='utf-8') as err_log:
        depth = int(input('请输入爬取深度:'))
        main(depth)
    input('按任意键退出...')
View Code

 

收藏 打印