2020-12-16课程笔记
早上七点学习了两节 Python 爬虫课,了解了如何用正则表达式匹配关键信息,以及如何用路径表达式提取信息。
课后尝试把示例代码改造成爬取需要登录的网页,但未能成功,代码如下:
import requests
import re
import time
# Identifier of the official account; sent as the `oaId` query parameter.
officialaccount_id = '695'
# Article identifier; not referenced by the code visible in this file.
article_id = '15233'
# Numeric setting (presumably a delay in seconds -- confirm); not referenced
# by the code visible in this file.
sleeptime = 10
def _joined_matches(pattern, text):
    """Return every capture of *pattern* found in *text*, concatenated."""
    return ''.join(re.findall(pattern, text))


def crawling(timeout=10):
    """Log in to szjz.pkpm.cn and print one official account's fan statistics.

    The function posts the login form, extracts the ``userKey`` value from the
    login response, requests ``GetFansStatistics`` for ``officialaccount_id``
    and prints the ``addFans``, ``allFans`` and ``allRead`` values.

    Args:
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Raises:
        RuntimeError: If the login response contains no ``userKey``; the
            statistics request would be meaningless without it.
    """
    login_url = 'https://szjz.pkpm.cn'
    # NOTE(review): credentials are hard-coded in the source; consider loading
    # them from the environment or a config file that is not committed.
    login_form = {
        'loginName': 'admin',
        'password': '*********************',
        'terminal': '1',
        'InviteUserId': '',
    }

    # The context manager closes the session's pooled connections even when a
    # request or the parsing below raises.
    with requests.Session() as session_request:
        result = session_request.post(
            login_url,
            data=login_form,
            headers=dict(referer=login_url),
            timeout=timeout,
        )
        body_message = result.content.decode('utf-8')
        user_key = _joined_matches('"userKey":"(.*?)",', body_message)
        if not user_key:
            # Fail here with a clear message instead of sending a request with
            # an empty userKey and failing later with an unrelated error.
            raise RuntimeError(
                'login failed: no "userKey" found in the response from '
                + login_url
            )

        url = (
            'https://szjz.pkpm.cn/GetFansStatistics?userKey=' + user_key
            + '&oaId=' + officialaccount_id
        )
        res = session_request.get(
            url,
            headers=dict(referer=url),
            timeout=timeout,
        )
        indexmessage = res.json()

    # The regexes below require text, so this assumes "bodyMessage" holds a
    # JSON-encoded string rather than a nested object -- TODO confirm.
    index_body_message = indexmessage["bodyMessage"]
    addfans = _joined_matches('addFans":(.*?),', index_body_message)
    allfans = _joined_matches('allFans":(.*?),', index_body_message)
    allread = _joined_matches('allRead":(.*?)}', index_body_message)

    print('addfans:' + addfans)
    print('allfans:' + allfans)
    print("allRead:" + allread)
# Run the crawl once when this script is executed.
crawling()
希望老师后续能专门出一套讲解如何爬取需要登录的网站的视频课程。
|