import json
from urllib.parse import parse_qs

import scrapy

from cnki.items import CnkiItem
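# Subject keyword to search for (Beijing Information Science and Technology University).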
keyword = '北京信息科技大学'
class CnkispiderSpider(scrapy.Spider):
    name = "cnkispider"
    allowed_domains = ["kns.cnki.net"]
    start_urls = [f"https://kns.cnki.net/kns8s/search?kw={keyword}"]
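    # Pace requests through Scrapy's scheduler rather than a blocking sleep() in
    # the callback; with RANDOMIZE_DOWNLOAD_DELAY (on by default) the effective
    # delay is 0.5x-1.5x of DOWNLOAD_DELAY, roughly matching the original
    # uniform(1, 3) pause. The value 2 here is an assumption, not from the source.
    custom_settings = {
        'DOWNLOAD_DELAY': 2,
    }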
    def start_requests(self):
        url = "https://kns.cnki.net/kns8s/brief/grid"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Referer': f'https://kns.cnki.net/kns8s/search?kw={keyword}',
        }
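        # Form payload mirroring the AJAX request the CNKI search page sends to
        # /kns8s/brief/grid: QueryJson carries the actual query (a subject
        # search, field "SU", on the keyword); the remaining fields control
        # paging, sorting, and the list-mode layout.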
        data = {
            'boolSearch': 'true',
            'QueryJson': json.dumps({
                "Platform": "",
                "Resource": "CROSSDB",
                "Classid": "WD0FTY92",
                "Products": "",
                "QNode": {
                    "QGroup": [{
                        "Key": "Subject",
                        "Title": "",
                        "Logic": 0,
                        "Items": [{
                            "Field": "SU",
                            "Value": keyword,
                            "Operator": "TOPRANK",
                            "Logic": 0,
                            "Title": "主题"
                        }],
                        "ChildItems": []
                    }]
                },
                "ExScope": 1,
                "SearchType": 2,
                "Rlang": "CHINESE",
                "KuaKuCode": "YSTT4HG0,LSTPFY1C,JUP3MUPD,MPMFIG1A,EMRPGLPA,WQ0UVIAA,BLZOG7CK,PWFIRAGL,NN3FJMUV,NLBO1Z6R",
                "Expands": {},
                "SearchFrom": 1
            }),
            'pageNum': '1',
            'pageSize': '20',
            'sortField': '',
            'sortType': 'des',
            'dstyle': 'listmode',
            'productStr': 'YSTT4HG0,LSTPFY1C,RMJLXHZ3,JQIRZIYA,JUP3MUPD,1UR4K4HZ,BPBAFJ5S,R79MZMCB,MPMFIG1A,EMRPGLPA,J708GVCE,ML4DRIDX,WQ0UVIAA,NB3BWEHK,XVLO76FD,HR1YT1Z9,BLZOG7CK,PWFIRAGL,NN3FJMUV,NLBO1Z6R',
            'aside': '',
            'searchFrom': '资源范围:总库',
            'subject': '',
            'language': '',
            'uniplatform': '',
        }
        # FormRequest issues a POST by default, so no explicit method is needed.
        yield scrapy.FormRequest(
            url=url,
            headers=headers,
            formdata=data,
            callback=self.parse_search_fromtitle,
        )
    def parse_search_fromtitle(self, response):
        # Throttling is handled by DOWNLOAD_DELAY in custom_settings; a blocking
        # sleep() here would stall Scrapy's event loop.
        # Each result row is a <tr> in the grid's result table.
        paper_li = response.xpath("//*[@class='result-table-list']/tbody/tr")
        for paper in paper_li:
            item = CnkiItem()
            # Titles and author lists are split across child nodes (e.g. keyword
            # highlighting), so join all text fragments.
            title_parts = paper.xpath("./td[@class='name']/a//text()").getall()
            item['Title'] = "".join(title_parts)
            author_parts = paper.xpath("./td[@class='author']/a//text()").getall()
            item['Author'] = " ".join(author_parts)
            item['Source'] = paper.xpath("./td[@class='source']/p/a//text()").get()
            item['Time'] = paper.xpath("./td[@class='date']/text()").get()
            item['Database'] = paper.xpath("./td[@class='data']/span/text()").get()
            item['Quote'] = paper.xpath("./td[@class='quote']/text()").get()
            item['Download'] = paper.xpath("./td[@class='download']/text()").get()
            yield item  # hand the item to the configured pipelines instead of printing it
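        # Pagination: if a "next page" link exists, replay the same POST with
        # pageNum incremented, reusing the parameters of the request that
        # produced this response.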
        is_nextpage = response.xpath("//a[@id='PageNext'][contains(@class, 'pagesnums')]")
        if is_nextpage:
            # parse_qs returns {key: [value, ...]}; flatten the lists back to
            # plain strings so the dict can be re-submitted as formdata.
            current_params = parse_qs(response.request.body.decode('utf-8'))
            request_params = {
                k: v[0] if isinstance(v, list) and len(v) > 0 else v
                for k, v in current_params.items()
            }
            current_page = int(request_params.get('pageNum', '1'))
            request_params['pageNum'] = str(current_page + 1)
            if current_page == 1:
                # After the first page, CNKI expects boolSearch=false so the
                # server pages through the existing result set instead of
                # running a fresh search.
                request_params['boolSearch'] = 'false'
            next_url = response.request.url
            headers = response.request.headers
            yield scrapy.FormRequest(
                url=next_url,
                headers=headers,
                formdata=request_params,
                callback=self.parse_search_fromtitle,
                # Every page posts to the same URL, so bypass the duplicate filter.
                dont_filter=True,
            )