# scrapy 初级使用 -> 微博热搜
# 示例效果很好,参考教程:
# https://www.51cto.com/article/689338.html
import scrapy
import re
import json
import os
import datetime
class HotsearchSpider(scrapy.Spider):
    """Scrape the Weibo realtime hot-search board and dump it to a JSON file.

    Side effects: ``parse`` changes the process working directory (stepping
    out of the ``spiders`` folder), creates a ``data`` directory if missing,
    and writes ``hotsearch-<timestamp>.json`` into it.
    """

    name = "hotsearch"
    allowed_domains = ["s.weibo.com"]
    start_urls = ["https://s.weibo.com/top/summary"]

    def parse(self, response):
        """Extract (rank -> {title, hot}) from the hot-search table and save it.

        Parameters
        ----------
        response : scrapy.http.Response
            The downloaded https://s.weibo.com/top/summary page.
        """
        alldata = {}
        rows = response.xpath("//*[@id='pl_top_realtimehot']/table/tbody/tr")
        for i, row in enumerate(rows):
            title = row.xpath("./td/a/text()").extract_first()
            hot = row.xpath("./td/span/text()").extract_first()
            # Rows without a title link (e.g. ad/placeholder rows) carry no
            # usable data — skip them instead of crashing on None.strip().
            if title is None:
                continue
            # The pinned top row (and label-only rows) have no numeric heat
            # value; fall back to "0" when the span is missing, blank, or
            # contains no digits (the old code crashed on .group() of None).
            match = re.search(r'\d+', hot) if hot else None
            alldata[str(i)] = {
                'title': title.strip(),
                'hot': match.group() if match else '0',
            }
        # Serialize with real double quotes and keep Chinese readable.
        payload = json.dumps(
            alldata,
            ensure_ascii=False
        )
        # Step out of the .../spiders directory to the project root before
        # locating/creating the data folder.
        os.chdir(os.getcwd().replace("spiders", ""))
        targetpath = "data"
        if os.path.exists(targetpath):
            print("存在")
        else:
            print("不存在,已为您创建 data 文件夹")
            os.mkdir(targetpath)
        os.chdir(targetpath)
        # Windows forbids ':' in filenames, so replace it in the timestamp.
        date = str(datetime.datetime.now()).replace(":", ".")
        with open("hotsearch-{}.json".format(date), "w", encoding="utf-8") as f:
            f.write(payload)
# 修改于 2023年6月27日 13点04分
# 注:才爬了几次就被限制了——暴力爬取不可取,请控制请求频率。