本脚本使用以下库(Python 3):
文件夹创建:os 库
HTML 解析:BeautifulSoup 库(bs4)
时间控制:time 库
多线程:threading 库
正则表达式:re 库
网页抓取:requests 库
import re
import requests
import time
import os
import threading
from bs4 import BeautifulSoup
# Listing-page URL prefix; the caller appends the page value after 'page='.
url1='http://www.ezdmw.com/Home/Index/contribution/up_atlas.html?type=&order=new&page='
# Site origin, prepended to the hrefs scraped from a listing page.
url2='http://www.ezdmw.com'
# HTTP headers sent with every request made by this script.
headers = {'Referer':'www.baidu.com',
           'User-Agent':'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'}
# Download root: the current working directory followed by a backslash and
# "Ezhan". The backslash is escaped explicitly because '\E' is not a valid
# escape sequence and triggers a warning on recent Python versions.
root=os.getcwd()+'\\Ezhan'
def xgname(name):
    """Return *name* with every character unsuitable for a file name replaced by '#'.

    The replaced characters are backslash, slash, colon, asterisk, question
    mark, angle brackets, pipe, double quote, space and dot. All other
    characters are kept unchanged and in their original order.
    """
    forbidden = ('\\', '/', ':', '*', '?', '<', '>', '|', '"', ' ', '.')
    return ''.join('#' if ch in forbidden else ch for ch in name)
def hq(dz,url):
    """Download *url* and write the response body to the file *dz*.

    Prints one status line (success or failure) and does not propagate
    ordinary errors, so it can be used directly as a thread target.

    Args:
        dz: Destination file path; opened in binary write mode.
        url: Address to fetch, requested with the module-level ``headers``.
    """
    try:
        img = requests.get(url, headers=headers).content
        # ``with`` closes the file even if the write fails.
        with open(dz, 'wb') as f:
            f.write(img)
    except Exception:
        # Best-effort download: report the failure and let the caller go on.
        print(url,dz,'获取错误')
    else:
        print(url,dz,'获取成功')
def jm2(name,url):
    """Download the images of one page into a folder named after *name*.

    The folder is ``root`` plus a backslash plus the sanitized *name* (see
    ``xgname``) and is created when missing. The page at *url* is fetched and
    every ``<img>`` whose grandparent element is a ``<pre>`` is downloaded by
    ``hq`` in its own thread, saved as ``1.jpg``, ``2.jpg``, ... in document
    order. The call returns only after all download threads have finished.

    Args:
        name: Title used (after sanitizing) as the folder name.
        url: Address of the page that contains the images.
    """
    name = xgname(name)
    dz = root + '\\' + name
    if not os.path.exists(dz):
        os.makedirs(dz)
    wz = requests.get(url, headers=headers).text
    dome = BeautifulSoup(wz, 'html.parser')
    dxc = []
    tot = 1
    for img in dome.find_all('img'):
        # An <img> directly under the document root has no grandparent;
        # skip it instead of failing on ``None.name``.
        parent = img.parent
        grandparent = parent.parent if parent is not None else None
        if grandparent is None or grandparent.name != 'pre':
            continue
        src = img.attrs.get('src')
        if src is None:
            # Nothing to download for an <img> without a src attribute.
            continue
        # ``daemon=True`` replaces the deprecated ``Thread.setDaemon(True)``.
        tt = threading.Thread(
            target=hq,
            args=(dz + '\\' + str(tot) + '.jpg', src),
            daemon=True,
        )
        dxc.append(tt)
        tot += 1
    print('加入任务中')
    for tt in dxc:
        tt.start()
    for tt in dxc:
        tt.join()
    print('完成任务')
def jm1(url):
    """Process one listing page and download every entry linked from it.

    Fetches *url*, selects the ``<a>`` tags with ``style="color:#000;"`` and
    ``target="_blank"``, and for each one that has a single string as its
    content prints the title and the absolute link, then hands both to
    ``jm2``.
    """
    page_html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(page_html, 'html.parser')
    links = soup.find_all('a', style="color:#000;", target="_blank")
    for link in links:
        title = link.string
        if title is None:
            # Anchors without a single text child carry no usable title.
            continue
        print(title)
        target = url2 + link.attrs['href']
        print(target)
        jm2(title, target)
if __name__ == "__main__":
    # Make sure the download root exists before any per-title folder is made.
    if not os.path.exists(root):
        os.makedirs(root)
    # Ask for the first page, then walk listing pages up to (not including) 11000.
    first_page = int(input('输入开始页数:'))
    for page in range(first_page, 11000):
        # The page query value is the page number followed by a literal '0'.
        jm1(url1 + str(page) + '0')
        print('第',page,'页执行完毕')
这是什么呢?