我的第三个爬虫

历史记录

清除记录

猜你想搜

AcWing热点
App
登录/注册

我的第三个爬虫

作者：

ZTEG , 2020-08-05 14:59:09 , 所有人可见 , 阅读 1151

本爬虫用到了下面这些库（Python 3）：
文件夹创建 os库
html分析 BeautifulSoup库
时间控制 time库
多线程 threading库
正则表达式 re库
网页抓取 requests库

import re
import requests
import time
import os
import threading
from bs4 import BeautifulSoup

# Listing endpoint (newest uploads first); the caller appends the value of the
# `page` query parameter to this prefix.
url1 = 'http://www.ezdmw.com/Home/Index/contribution/up_atlas.html?type=&order=new&page='
# Site origin, prepended to the hrefs found on listing pages.
url2 = 'http://www.ezdmw.com'
# Headers sent with every HTTP request made by this script.
headers = {
    'Referer': 'www.baidu.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
}
# Download directory: "Ezhan" under the current working directory.
# The backslash is escaped explicitly: the previous '\E' was an invalid escape
# sequence that only happened to yield a backslash and is reported as a
# DeprecationWarning/SyntaxWarning by newer Python versions. The resulting
# string is unchanged.
root = os.getcwd() + '\\Ezhan'

def xgname(name):
    """Return *name* with characters unsuitable for a folder name replaced by '#'.

    The replaced characters are: backslash, slash, colon, asterisk, question
    mark, angle brackets, pipe, double quote, space and dot. Every other
    character is kept as-is.
    """
    unsafe = ('\\', '/', ':', '*', '?', '<', '>', '|', '"', ' ', '.')
    return ''.join('#' if ch in unsafe else ch for ch in name)

def hq(dz,url):
    """Download *url* and write the raw response body to the file path *dz*.

    Any failure (request error, file error) is reported on stdout instead of
    being raised, so one failed download does not abort the caller.

    Args:
        dz: Destination file path; opened in binary write mode.
        url: Address of the resource to fetch.
    """
    try:
        img = requests.get(url, headers=headers).content
        # NOTE: the HTTP status code is not checked, so an error page body
        # would be saved just like an image.
        # `with` guarantees the file is closed; the previous code wrote
        # `f.close` without parentheses, which never called the method.
        with open(dz, 'wb') as f:
            f.write(img)
    except Exception:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        print(url,dz,'获取错误')
    else:
        print(url,dz,'获取成功')

def jm2(name,url):
    """Download the images of the page at *url* into a folder named after *name*.

    The folder is created under ``root`` using the sanitized *name*. Only
    ``<img>`` tags whose grandparent element is a ``<pre>`` are downloaded;
    each one is fetched by ``hq`` in its own thread and saved as
    ``1.jpg``, ``2.jpg``, ... in document order. The function returns after
    all download threads have finished.

    Args:
        name: Title used (after sanitizing with ``xgname``) as the folder name.
        url: Address of the page whose images are downloaded.
    """
    # Backslash-joined to stay consistent with how `root` itself is built.
    folder = root + '\\' + xgname(name)
    if not os.path.exists(folder):
        os.makedirs(folder)
    page = requests.get(url, headers=headers).text
    soup = BeautifulSoup(page, 'html.parser')
    workers = []
    for img in soup.find_all('img'):
        parent = img.parent
        grandparent = parent.parent if parent is not None else None
        # An <img> sitting directly under the document root has no
        # grandparent; skip it instead of failing on `None.name`.
        if grandparent is None or grandparent.name != 'pre':
            continue
        src = img.attrs.get('src')
        if src is None:
            # No source address to download; previously this raised KeyError.
            continue
        target = folder + '\\' + str(len(workers) + 1) + '.jpg'
        # `daemon=True` replaces the deprecated Thread.setDaemon(True).
        workers.append(
            threading.Thread(target=hq, args=(target, src), daemon=True)
        )
    print('加入任务中')
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
    print('完成任务')


def jm1(url):
    """Process one listing page: find its entry links and download each entry.

    Every ``<a>`` tag with ``style="color:#000;"`` and ``target="_blank"``
    that has a single text child is treated as an entry. Its text and full
    address are printed, then ``jm2`` downloads its images.

    Args:
        url: Address of the listing page to fetch.
    """
    page = requests.get(url, headers=headers).text
    soup = BeautifulSoup(page, 'html.parser')
    for link in soup.find_all('a', style="color:#000;", target="_blank"):
        title = link.string
        href = link.attrs.get('href')
        # Skip anchors without a single text child (`.string` is None) and
        # anchors without an href, which previously raised KeyError and
        # aborted the whole crawl.
        if title is None or href is None:
            continue
        entry_url = url2 + href
        print(title)
        print(entry_url)
        jm2(title, entry_url)


if __name__ == "__main__":
    # The download root must exist before any per-entry folder is created in it.
    if not os.path.exists(root):
        os.makedirs(root)
    # Ask for the first page, then walk the listing pages up to (not including) 11000.
    first_page = int(input('输入开始页数：'))
    for page in range(first_page, 11000):
        # The `page` query value is the page number followed by a literal '0'.
        jm1(''.join((url1, str(page), '0')))
        print('第',page,'页执行完毕')

1 评论

MournInk 2020-08-06 08:27

这是什么呢？

App 内打开