An nhentai crawler I wrote myself
At the moment it is single-threaded and simply downloads every image from the given URL.

```python
# -*- coding: utf-8 -*-
import os

import requests
from bs4 import BeautifulSoup


def mkdir(path):
    # Strip surrounding whitespace and any trailing backslash
    path = path.strip()
    path = path.rstrip("\\")
    # Check whether the path already exists
    isExists = os.path.exists(path)
    if not isExists:
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        print(path + ' already exists')
        return False


def trans(p):
    # Remove characters that are illegal in Windows file names
    p = p.replace('|', '')
    p = p.replace('?', '')
    p = p.replace('*', '')
    p = p.replace('<', '')
    p = p.replace('>', '')
    p = p.replace('/', '')
    p = p.replace('\\', '')
    p = p.replace('"', '')
    p = p.replace(':', '')
    return p


headers = {'user-agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
url = "https://nhentai.net/search/?q=doujinshi+full+color"
resp = requests.get(url=url, headers=headers)
resp.encoding = 'UTF-8'
soup = BeautifulSoup(resp.text, 'html.parser')

# Grab the link to each gallery on the search results page
for news in soup.select('.gallery'):
    a = news.select('a')[0]['href']  # href of the <a> tag inside the .gallery element
    url1 = "https://nhentai.net" + a
    resp1 = requests.get(url=url1, headers=headers)
    resp1.encoding = 'UTF-8'
    soup1 = BeautifulSoup(resp1.text, 'html.parser')
    # Handle the title: try the h2 title first, fall back to the English h1 title
    try:
        dirname = soup1.select('#info h2')[0].text
    except IndexError:
        dirname = soup1.select('#info h1')[0].text
        # print("Chinese title not found; using the English title")
        # print(dirname)
    dirname = trans(dirname)
    mkpath = "E:\\nhentai\\" + dirname + "\\"
    mkdir(mkpath)
    for news1 in soup1.select('.thumb-container'):
        b = news1.select('a')[0]['href']  # link to the single-page viewer
        img = "https://nhentai.net" + b   # build the absolute URL
        resp2 = requests.get(url=img, headers=headers)
        resp2.encoding = 'UTF-8'
        soup2 = BeautifulSoup(resp2.text, 'html.parser')
        c = soup2.select('.fit-horizontal')[0]['src']  # URL of the full-size image
        name = c.split("/")[-1]  # the file name is the last path segment
        r = requests.get(url=c, headers=headers)
        with open(mkpath + name, 'wb') as f:
            f.write(r.content)
```
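As noted at the top, the script is single-threaded. If I want to speed it up later, a minimal sketch of parallelizing just the image downloads with a standard-library thread pool could look like this (`fetch_image` and `download_all` are hypothetical helpers, not part of the script above):

```python
from concurrent.futures import ThreadPoolExecutor

import requests


def fetch_image(img_url, save_path, headers):
    # Hypothetical helper: download one image and write it to disk
    r = requests.get(url=img_url, headers=headers)
    with open(save_path, 'wb') as f:
        f.write(r.content)


def download_all(tasks, headers, workers=8):
    # tasks: (image_url, save_path) pairs collected while walking the galleries
    with ThreadPoolExecutor(max_workers=workers) as pool:
        for img_url, save_path in tasks:
            pool.submit(fetch_image, img_url, save_path, headers)
```

The HTML scraping itself can stay single-threaded; only the final image requests benefit from concurrency, and keeping the worker count small avoids hammering the site.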