Code Example
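The script below crawls the photo gallery of a Douban movie page (subject 26100958): it builds the paginated gallery URLs, extracts each image's id and src from the list markup, and saves the JPEGs to a local directory.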
import requests
from bs4 import BeautifulSoup
import os
import warnings

warnings.filterwarnings('ignore')  # suppress warnings
def get_urls(n):
    '''
    Collect the paginated photo-list URLs.
    n: number of pages
    '''
    lst = []
    for i in range(n):
        lst.append('https://movie.douban.com/subject/26100958/photos?type=S&start=%i&sortby=like&size=a&subtype=a' % (i * 30))
        # Note: when pasting a copied URL into a %-formatted string, any literal
        # percent-escapes such as '%E7%94%B5%E5%BD%B1' must be doubled to
        # '%%E7%%94%%B5%%E5%%BD%%B1'.
    return lst
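# For illustration (hypothetical URL, not the one used above): with %-formatting,
# a copied query string such as '%E7%94%B5%E5%BD%B1' needs every '%' doubled,
#   'https://example.com/search?q=%%E7%%94%%B5%%E5%%BD%%B1&start=%i' % 30
# otherwise the escapes are misread as conversion specifiers and formatting fails
# with a ValueError; an f-string sidesteps the escaping entirely.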
def get_pic(ui, d_h, d_c):
    '''
    Collect the image data from one page.
    ui: page URL
    d_h: user-agent header dict
    d_c: cookies dict
    Returns a list of dicts, each holding an image id and its src.
    '''
    ri = requests.get(url=ui, headers=d_h, cookies=d_c)  # fetch the page
    soupi = BeautifulSoup(ri.text, 'lxml')  # parse the page
    lis = soupi.find('ul', class_="poster-col3 clearfix").find_all('li')
    piclst = []
    for li in lis:
        dic = {}
        dic['picname'] = li['data-id']
        dic['picsrc'] = li.find('img')['src']
        piclst.append(dic)
    return piclst
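# Shape of the value returned by get_pic (values are illustrative, not real data):
# [{'picname': '2620131473',
#   'picsrc': 'https://img1.doubanio.com/view/photo/m/public/p2620131473.jpg'}, ...]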
def save_pic(picdic):
    '''
    Save one image to disk.
    picdic: dict holding the image id and its src
    '''
    img = requests.get(url=picdic['picsrc'])  # fetch the image
    with open('p' + picdic['picname'] + '.jpg', 'wb') as f:
        f.write(img.content)  # write to file; the with-block closes f
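# A streaming variant of the download (sketch; equivalent for these small JPEGs,
# but avoids holding a large file fully in memory):
#   with requests.get(picdic['picsrc'], stream=True) as r:
#       with open('p' + picdic['picname'] + '.jpg', 'wb') as f:
#           for chunk in r.iter_content(chunk_size=8192):
#               f.write(chunk)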
if __name__ == "__main__":
    ########################################################################################
    os.chdir('D:/python/爬虫/pics/')
    # set the image storage directory
    urllst = get_urls(2)
    print(urllst)
    # collect the paginated URLs
    h_dic = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
    # request headers (user-agent)
    cookies = 'bid=6fdpHZMOVvg; gr_user_id=1ca72435-cf9b-40dc-ae6f-b52754273307; _vwo_uuid_v2=D6A85640C7C2478BBA6CED1195481B3FF|f162ecf0887c0c18b19113d426800c90; __gads=ID=068af084363458ea:T=1577801166:S=ALNI_MajzUa1owKHcehSyR8YVFpCtJWNvQ; __utmz=30149280.1577906690.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; dbcl2="208769186:KXNlW8fWGvg"; push_noty_num=0; push_doumail_num=0; __utmv=30149280.20876; ct=y; ll="108306"; ap_v=0,6.0; __utma=30149280.1098002445.1577801161.1577989880.1578339375.5; __utmb=30149280.0.10.1578339375; __utmc=30149280; __utma=223695111.849371552.1578339375.1578339375.1578339375.1; __utmb=223695111.0.10.1578339375; __utmc=223695111; __utmz=223695111.1578339375.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _pk_ses.100001.4cf6=*; __yadk_uid=LFhuCFSazPStSJkHCznw6ElY8PCHg6yu; trc_cookie_storage=taboola%2520global%253Auser-id%3D9b105f6b-121b-4260-af37-c882fe49ef70-tuct4c8c2be; _pk_id.100001.4cf6=36a255ce8300c1af.1578339377.1.1578339546.1578339377.'
    c_dic = {}
    for i in cookies.split('; '):
        k, v = i.split('=', 1)  # split on the first '=' only: values such as __gads contain '='
        c_dic[k] = v
    # build the cookies dict
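    # (Alternative sketch: the standard library parses the same header string,
    #  e.g. {k: m.value for k, m in SimpleCookie(cookies).items()} after
    #  `from http.cookies import SimpleCookie`; shown for reference only.)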
    #######################################################################################
    srclst = []  # collected image info
    error_lis = []  # page URLs whose collection failed
    for u in urllst:  # urllst: paginated URLs
        try:
            imgs = get_pic(u, h_dic, c_dic)  # data-collection function
            srclst.extend(imgs)
            print('Image srcs collected, %i in total' % len(srclst))
        except Exception:
            error_lis.append(u)  # keep the failed URL for a later retry
            continue
    # save the images
    n = 1
    for src in srclst:  # srclst: list of all image info dicts
        try:
            save_pic(src)
            print('Image saved, %i images downloaded' % n)
            n += 1
        except Exception:
            continue
    # batch image download
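For larger galleries, the per-image downloads can run in parallel. A minimal sketch using the standard-library concurrent.futures (an addition, not part of the original script; it reuses the save_pic function and the srclst list defined above):

from concurrent.futures import ThreadPoolExecutor, as_completed

def save_all(srclst, workers=8):
    # submit one download per image dict and count completions as they arrive
    done = 0
    with ThreadPoolExecutor(max_workers=workers) as pool:
        futures = [pool.submit(save_pic, src) for src in srclst]
        for fut in as_completed(futures):
            try:
                fut.result()
                done += 1
            except Exception:
                continue  # skip failed downloads, mirroring the loop above
    return done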