First, let me introduce a scraping tool I wrote myself: BusinessTool.exe.
- Usage:
- Simply configure the corresponding type and url in config.txt (the script reads every file under the config/ folder); a sample entry is sketched after this list.
- Only these two fields need to be modified.
- url is the homepage URL of the Weibo account you want to scrape, or the equivalent Douyin, Xiaohongshu, or Bilibili profile link.
- type is the link type: 1 = Weibo, 2 = Douyin, 3 = Kuaishou, 4 = Bilibili, 5 = Xiaohongshu.
- Once the configuration is done, run BusinessTool.exe and it will scrape the corresponding follower counts.
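Judging from how `ReadFromConfig` parses the file (it evaluates the contents as a list of dicts with `url`, `type`, and `uid` keys), a config entry might look like the sketch below. The URLs and the uid value here are placeholders for illustration, not real accounts:

```python
# config/config.txt -- hypothetical example; replace the urls and uid with your own values
[
    {'url': 'https://weibo.com/u/1234567890', 'type': '1', 'uid': '130'},
    {'url': 'https://space.bilibili.com/123456', 'type': '4', 'uid': '130'},
]
```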
Example code
Here is the full source, which took me a day to write, shared free of charge.
```python
import requests
from bs4 import BeautifulSoup
import json
import ast
from time import strftime, localtime
from fontTools.ttLib import TTFont
import re
import os

# Platform type constants
G_WEIBO = '1'
G_DOUYIN = '2'
G_KUAISHOU = '3'
G_BZHAN = '4'
G_XIAOHONGSHU = '5'

# Weibo login cookies copied from a logged-in browser session
LOGIN_COOKIES = 'SINAGLOBAL=522519899039.0867.1574345484645; UM_distinctid=16f5c2b0fd5388-06657092a336d8-6701b35-1fa400-16f5c2b0fd621e; SUHB=02Meot-9jOoPJy; ALF=1621327072; SUB=_2AkMploOff8NxqwJRmP4Qzm3ja4p3zA_EieKfynJEJRMxHRl-yT9kqhAJtRB6AhatcJAakTfQ4KKZBahMLkPKCqGmy6qa; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WhZH_vEymdzQfyYRooMOfCs; UOR=,,news.ifeng.com; YF-Page-G0=913e50d6fa3a3406e80cc7f737d4352f|1590646466|1590646466; _s_tentry=-; Apache=7843672000587.247.1590646468304; ULV=1590646468380:10:6:3:7843672000587.247.1590646468304:1590636352909; YF-V5-G0=4e19e5a0c5563f06026c6591dbc8029f'
# Parse the cookie string into a dict (strip spaces, split only on the first '=')
cookies2 = dict(item.strip().split('=', 1) for item in LOGIN_COOKIES.split(';'))

# Douyin obfuscates the digits on its pages with a custom woff font;
# 111.woff is a local copy of that font, used to build the digit mapping
ttfont = TTFont('111.woff')
best_cmap = ttfont['cmap'].getBestCmap()
def get_best_cmap():
    '''
    Build the mapping table from the font's cmap.
    :return: dict mapping hex code points to glyph names
    '''
    new_best_cmap = {}
    for key, value in best_cmap.items():
        new_best_cmap[hex(key)] = value
    return new_best_cmap


def get_num_cmap():
    # Glyph name -> real digit (the font deliberately shuffles the digits)
    num_map = {
        "x": "", "num_": "1", "num_1": "0",
        "num_2": "3", "num_3": "2",
        "num_4": "4", "num_5": "5",
        "num_6": "6", "num_7": "9",
        "num_8": "7", "num_9": "8",
    }
    return num_map
def get_html(url):
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36'}
    response = requests.get(url=url, headers=headers).text
    return response


def replace_num_and_cmap(result, response):
    # Replace every obfuscated HTML entity in the page with its real digit
    for key, value in result.items():
        if key in response:
            response = re.sub(key, value, response)
    return response


# Save the de-obfuscated page to disk (useful for debugging)
def save_to_file(response):
    with open('douyin.html', 'w', encoding='utf-8') as fp:
        fp.write(response)


def map_cmap_num(get_best_cmap, get_num_cmap):
    # Combine the two tables: '&#x....;' HTML entity -> real digit
    result = {}
    for key, value in get_best_cmap().items():
        key = re.sub('0', '&#', key, count=1) + ';'  # '0xe602' -> '&#xe602;'
        result[key] = get_num_cmap()[value]
    return result
def ScrapyDouYin(url, uid):
    '''
    Scrape the Douyin follower count
    :param url: profile URL
    :return:
    '''
    result = map_cmap_num(get_best_cmap, get_num_cmap)
    response = get_html(url)
    response = replace_num_and_cmap(result, response)
    bs_ = BeautifulSoup(response, 'lxml')
    div_ = bs_.find('span', class_='follower block')
    follower_num = div_.text  # raw follower text
    fans_numbers = follower_num.replace(" ", "")
    fans_number = fans_numbers.replace("粉丝", '')  # strip the Chinese word for "followers"
    print('Follower count: ' + fans_number)
    print('Sending request')
    PostReq(url, '2', fans_number, uid)
def fans(mid, name=-1):
    # Query Bilibili's relation API for the follower count of user `mid`
    mid = str(mid)
    if name == -1:
        name = mid
    url = "https://api.bilibili.com/x/relation/stat?vmid=" + mid + "&jsonp=jsonp"
    resp = requests.get(url)  # the endpoint returns JSON
    info = resp.json()
    fans_number = info['data']['follower']
    return fans_number
def PostReq(url, type, fans, uid='130'):
    '''
    POST the result to the backend so it can be written to the database
    :param url: profile URL
    :param type: platform type, 1: Weibo, 2: Douyin, 3: Kuaishou, 4: Bilibili, 5: Xiaohongshu
    :param fans: follower count
    :param uid: defaults to 130
    :return:
    '''
    url_ = 'http://xmk.oywblog.com/service4/user/buyer/User_fans/update_user_social'
    json_ = {'type': type, 'url': url, 'fans': fans, "uid": uid}
    res = requests.post(url_, data=json_)
    print(res.status_code)
    print(res.content)
def ScrapyWB(url, uid):
    '''
    Get the Weibo follower count
    :param url:
    :return:
    '''
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    res = requests.get(url, headers=headers_, cookies=cookies2)
    print(res.url)
    req = requests.get(res.url, headers=headers_, cookies=cookies2)  # fetch the final page after redirects
    res_str = req.text
    pos_ = res_str.find('粉丝(')  # locate the Chinese "followers(" label in the page source
    num_str = res_str[pos_:pos_ + 15]
    pos_left = num_str.find('(')
    pos_right = num_str.find(')')
    fans_number = num_str[pos_left + 1:pos_right]  # extract the follower count
    print('Follower count: ' + fans_number)
    print('Sending request')
    PostReq(url, '1', fans_number, uid)
def ScrapyXiaoHongShu(url, uid):
    '''
    Get the Xiaohongshu follower count
    :param url:
    :return:
    '''
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    res = requests.get(url, headers=headers_)
    bs_ = BeautifulSoup(res.text, 'lxml')
    div_ = bs_.find('div', class_='card-info')
    span_ = div_.find_all('span', class_='info-number')
    fans_num = span_[1].text  # the second info-number span holds the follower count
    print('Follower count: ' + fans_num)
    fans_number = fans_num.strip()
    print('Sending request')
    PostReq(url, '5', fans_number, uid)
def ScrapyKuaiShou(url):
    '''
    Get the Kuaishou follower count (parsing not implemented yet; this only fetches the page)
    :param url:
    :return:
    '''
    headers_ = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36'}
    res = requests.get(url, headers=headers_)
def ScrapyBZhan(url, uid):
    '''
    Scrape the Bilibili follower count
    :param url:
    :return:
    '''
    # Extract the numeric uid that follows 'com/' in the profile URL
    url_pos_left = url.find('com/')
    pors_ = url.find('/', url_pos_left + 4)
    if pors_ == -1:  # no trailing path after the uid
        pors_ = len(url)
    uid_num = url[url_pos_left + 4:pors_]
    fans_number = fans(uid_num)  # query the relation API
    print("Follower count: " + str(fans_number))
    print('Sending request to write the data')
    PostReq(url, '4', str(fans_number), uid)
def ScrapyData(url, type_, uid):
    '''
    Dispatch to the right scraper based on the configured type
    :param url:
    :param type_:
    :param uid:
    :return:
    '''
    print('url: ' + url)
    print('type: ' + type_)
    print('uid: ' + uid)
    print('Config entry loaded')
    if type_ == G_WEIBO:
        print('Scraping Weibo followers')
        ScrapyWB(url, uid)
    elif type_ == G_BZHAN:
        print('Scraping Bilibili followers')
        ScrapyBZhan(url, uid)
    elif type_ == G_DOUYIN:
        print('Scraping Douyin followers')
        ScrapyDouYin(url, uid)
    elif type_ == G_KUAISHOU:
        print('Scraping Kuaishou followers')
        ScrapyKuaiShou(url)
    elif type_ == G_XIAOHONGSHU:
        print('Scraping Xiaohongshu followers')
        ScrapyXiaoHongShu(url, uid)
def ReadFromConfig(file_name):
    '''
    Read one config file: a list of {'url': ..., 'type': ..., 'uid': ...} entries
    :return:
    '''
    with open(file_name, 'r') as f:
        rs_ = f.read()
    json_list = list(ast.literal_eval(rs_))  # safer than eval for literal data
    for json_str in json_list:
        url_ = json_str['url']
        type_ = json_str['type']
        uid_ = json_str['uid']
        ScrapyData(url_, type_, uid_)
    return
if __name__ == '__main__':
    print('Reading config files from the config/ folder')
    # Iterate over every config file in the folder
    files = os.listdir('config/')
    for filename in files:
        print(filename)
        ReadFromConfig('config/' + filename)
```
Source code and tool download link: https://download.csdn.net/download/Giser_D/12473002
Finally, if you want to learn more about web scraping, you are welcome to join this WeChat group so we can learn and improve together.
And if you find this genuinely helpful, you can also scan the QR code to buy me a milk tea.