请注意,本文编写于 2208 天前,最后修改于 2208 天前,其中某些信息可能已经过时。
自己写的一个nhentai爬虫
目前只支持单线程抓取指定 URL 页面下的所有图片
# -*- coding: utf-8 -*-
import requests
import os
from bs4 import BeautifulSoup
def mkdir(path):
    """Create directory *path* (including parents) if it does not exist.

    Leading/trailing whitespace and a trailing backslash are stripped first.

    Returns:
        True if the directory was created, False if it already existed.
    """
    # Strip surrounding whitespace
    path = path.strip()
    # Strip a trailing backslash (Windows-style path)
    path = path.rstrip("\\")
    # EAFP instead of exists()-then-makedirs(): avoids the TOCTOU race where
    # another process creates the directory between the check and the call.
    try:
        os.makedirs(path)
    except FileExistsError:
        print(path + ' 目录已存在')
        return False
    print(path + ' 创建成功')
    return True
def trans(p):
    """Remove characters that are illegal in Windows file/directory names.

    Deletes | ? * < > / \ " : from *p* and returns the sanitized string.
    """
    # Single C-level pass via str.translate instead of nine chained
    # .replace() calls (the original also replaced '\\' twice).
    return p.translate(str.maketrans('', '', '|?*<>/\\":'))
headers = {'user-agent':'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
url = "https://nhentai.net/search/?q=doujinshi+full+color"


def _get_soup(page_url):
    """Fetch *page_url* and return it parsed as BeautifulSoup.

    timeout=30 prevents the crawler from hanging forever on a stalled
    connection (the original calls had no timeout at all).
    """
    resp = requests.get(url=page_url, headers=headers, timeout=30)
    resp.encoding = 'UTF-8'
    return BeautifulSoup(resp.text, 'html.parser')


soup = _get_soup(url)
# Walk every gallery entry on the search-result page
for news in soup.select('.gallery'):
    # href of the first <a> inside the .gallery element → gallery page
    a = news.select('a')[0]['href']
    soup1 = _get_soup("https://nhentai.net" + a)
    # Title handling: prefer the secondary title (h2), fall back to h1
    # when the gallery has no h2.
    try:
        dirname = soup1.select('#info h2')[0].text
    except IndexError:
        dirname = soup1.select('#info h1')[0].text
    # Sanitize the title so it is a legal Windows directory name
    dirname = trans(dirname)
    mkpath = "E:\\nhentai\\" + dirname + "\\"
    mkdir(mkpath)
    # Each .thumb-container links to one image's viewer page
    for news1 in soup1.select('.thumb-container'):
        b = news1.select('a')[0]['href']
        soup2 = _get_soup("https://nhentai.net" + b)
        # Full-size image URL from the viewer page
        c = soup2.select('.fit-horizontal')[0]['src']
        # Last path segment of the URL is the file name
        name = c.split("/")[-1]
        r = requests.get(url=c, headers=headers, timeout=30)
        with open(mkpath + name, 'wb') as f:
            f.write(r.content)