Python爬虫爬取笔趣阁小说.(利用笔趣阁引擎实现搜索不同小说)
前言
我将这个程序分为两个功能,一是实现爬取小说的最新章节,二是爬取小说的所有章节.
仅供学习.
获取小说详情页的html
通过函数gethtml()实现.
def gethtml(url):
    """Fetch the novel chapter/detail page at *url* and return its HTML text.

    On any request failure the original contract is kept: the string
    "产生异常1." is returned instead of raising.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        # timeout keeps a stalled connection from hanging the whole crawl
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        # apparent_encoding sniffs the real charset of the page body
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # narrowed from a bare except: catch only network/HTTP errors,
        # not programming bugs
        return "产生异常1."
获取小说具体章节的URL
对于爬取所有章节和最新章节这两个功能来说,实现这个步骤分别用了两个函数 geturl_all()和 getnewtexturl() .
这两个函数用于分析爬取到的html,并分别获取所有章节的url和最新章节的url.
def geturl_all(url):
    """Fetch a novel's index page and return the chapter-link fragments.

    Returns a list with one entry per <dd> tag; each entry is the list
    produced by re.findall(r"href=.*html", ...) on that tag (empty when
    the <dd> holds no chapter link).  On request failure returns the
    string "产生异常2." to preserve the original error contract.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        # each chapter on this site lives in a <dd><a href="...html"> entry
        return [re.findall(r"href=.*html", str(dd)) for dd in soup.find_all("dd")]
    except requests.RequestException:
        return "产生异常2."
def getnewtexturl(url):
    """Return (as str of a list) the href fragment of the newest chapter.

    *url* is the novel's introduction page; the last <dd> on that page is
    the most recent chapter on this site's layout.  Failure returns
    "产生异常2." (same message as geturl_all, kept from the original).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        chapters = soup.find_all("dd")
        # chapters[-1] raises IndexError on a page with no <dd> tags; the
        # original bare except swallowed that too, so keep it handled
        ans = re.findall(r"href=.*html", str(chapters[-1]))
        print(ans)
        return str(ans)
    except (requests.RequestException, IndexError):
        return "产生异常2."
获取小说章节中的正文.
通过正则表达式来清洗数据,得到正文.
def find_text(html):
    """Extract the readable chapter text from a chapter page's HTML.

    Mirrors the original pipeline exactly: collect everything after
    id="content" on its line, take the repr of that list (so carriage
    returns become the two characters backslash-r), split on the
    repr-level paragraph break, then rejoin with spaces and newlines.
    """
    fragments = re.findall(r'id="content"(.*)', html)
    paragraphs = re.split(r"\\r<br />\\r<br /> ", str(fragments))
    return " " + "".join(" " + piece + "\n" for piece in paragraphs)
全部代码:
import requests
import os
from bs4 import BeautifulSoup
import re
import time
def gethtml(url):
    """Fetch the novel chapter/detail page at *url* and return its HTML text.

    On any request failure the original contract is kept: the string
    "产生异常1." is returned instead of raising.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        # timeout keeps a stalled connection from hanging the whole crawl
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        # apparent_encoding sniffs the real charset of the page body
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # narrowed from a bare except: catch only network/HTTP errors
        return "产生异常1."
def geturl_all(url):
    """Fetch a novel's index page and return the chapter-link fragments.

    Returns a list with one entry per <dd> tag; each entry is the list
    produced by re.findall(r"href=.*html", ...) on that tag (empty when
    the <dd> holds no chapter link).  On request failure returns the
    string "产生异常2." to preserve the original error contract.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        # each chapter on this site lives in a <dd><a href="...html"> entry
        return [re.findall(r"href=.*html", str(dd)) for dd in soup.find_all("dd")]
    except requests.RequestException:
        return "产生异常2."
def getnewtexturl(url):
    """Return (as str of a list) the href fragment of the newest chapter.

    *url* is the novel's introduction page; the last <dd> on that page is
    the most recent chapter on this site's layout.  Failure returns
    "产生异常2." (same message as geturl_all, kept from the original).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        chapters = soup.find_all("dd")
        # chapters[-1] raises IndexError on a page with no <dd> tags; the
        # original bare except swallowed that too, so keep it handled
        ans = re.findall(r"href=.*html", str(chapters[-1]))
        print(ans)
        return str(ans)
    except (requests.RequestException, IndexError):
        return "产生异常2."
def find_text(html):
    """Extract the readable chapter text from a chapter page's HTML.

    Mirrors the original pipeline exactly: collect everything after
    id="content" on its line, take the repr of that list (so carriage
    returns become the two characters backslash-r), split on the
    repr-level paragraph break, then rejoin with spaces and newlines.
    """
    fragments = re.findall(r'id="content"(.*)', html)
    paragraphs = re.split(r"\\r<br />\\r<br /> ", str(fragments))
    return " " + "".join(" " + piece + "\n" for piece in paragraphs)
moshi=input("1.爬取全文.\n2.爬取最新章节.\n")
def hind_all():
    """Crawl every chapter of the hard-coded novel into one text file.

    Relies on the module-level helpers geturl_all/gethtml/find_text.
    """
    url = "https://www.xbiquge.la/82/82620/"  # novel introduction page
    url_list = geturl_all(url)
    # 'with' guarantees the output file is closed even if a request fails
    # (the original leaked the handle on any exception before r.close())
    with open("小说全部章节.text", 'w+', encoding="utf-8") as out:
        for i in range(0, len(url_list)):
            print("正在爬取中,请稍候当前在", i, "章节")
            # each entry looks like ['href="/82/82620/xxx.html']; [8:-2]
            # strips the  ['href="  prefix and  '] suffix of the repr
            chapter = str(url_list[i])
            chapter_url = "https://www.xbiquge.la" + chapter[8:-2]
            html = gethtml(chapter_url)
            out.write(find_text(html))
    print("爬取成功.")
def hind_last():
    """Crawl only the newest chapter of the hard-coded novel to a file."""
    url = "https://www.xbiquge.la/82/82620/"  # novel introduction page
    url = getnewtexturl(url)  # str of ['href="...html'] for the newest chapter
    print(url)
    # [8:-2] strips the  ['href="  prefix and  '] suffix of the repr
    url = "https://www.xbiquge.la" + url[8:-2]
    html = gethtml(url)
    text = find_text(html)
    # the with-block closes the file; the original's explicit f.close()
    # inside it was redundant and has been removed
    with open("小说最新章节.text", 'w+', encoding="utf-8") as f:
        f.write(text)
# Dispatch on the mode chosen at the prompt above: "1" crawls the whole
# book, anything else fetches only the newest chapter.
if moshi == "1":
    hind_all()
else:
    hind_last()
更新:
隔了一段时间,感觉原功能还是太简陋,想要爬小说,必须要给定小说的介绍页面的url.所以我又优化了一下程序,新增了几个函数,来实现类似搜索引擎的功能.
首先模拟浏览器发出post请求,去请求你想要搜索的小说的名字.利用以下函数来实现.
def findbook(url):
    """POST a search query to the site's search endpoint and return the
    result-page HTML, or "搜索框错误." on request failure.

    Prompts the user for the novel title on stdin.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    name = input("请输入你想要搜索的小说名字:")
    # NOTE(review): the form field name carries a leading space
    # (" searchkey"); verify against the site's actual <form> before
    # "fixing" it — changed form data would change what the server sees.
    keyword = {
        " searchkey": name
    }
    try:
        r = requests.post(url=url, data=keyword, headers=headers, timeout=30)
        r.raise_for_status()  # surface HTTP errors instead of parsing an error page
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # narrowed from a bare except: catch only network/HTTP errors
        return "搜索框错误."
然后进行数据清洗,在浏览器给出的响应信息页面中找出搜索结果,找出搜索到的小说和其介绍页面的url.
def findbookurl(html):
    """Pull the search-result links out of the search result page.

    Returns a list of strings; each element is the str() of a
    one-element findall list shaped like ['https://...">书名</a>'].
    Downstream code slices off that repr wrapper, so it is intentional
    and must be preserved.
    """
    soup = BeautifulSoup(html, 'html.parser')
    cells = str(soup.find_all("td", attrs={"class": "even"}))
    results = []
    for fragment in cells.split("<td"):
        match = re.findall(r"https.*</a>", fragment)
        if match:
            results.append(str(match))
    return results
返回格式是一个列表,列表的每个元素里边记录了每个搜索到的小说的url和书名.
然后输入想要爬取第几本书.
# Let the user pick one search hit.  Each listurl entry is the str() of
# a one-element list like ['https://...">书名</a>'], so the slices below
# first skip the two-character  ['  prefix of the repr.
listurl = findbookurl(text)
print(listurl)
num = input("你想要爬取第几本小说?")
choice = listurl[int(num) - 1]
url = choice[2:choice.find("\"")]                     # URL runs up to the closing quote
name = choice[choice.find(">") + 1:choice.find("<")]  # title sits between > and <
接下来就是将其url传入之前写的函数里,和之前功能相仿.最后依旧是将小说内容存入TXT文档里.
完整代码:
import requests
import os
from bs4 import BeautifulSoup
import re
import time
import 爬取笔趣阁
import pa
def findbook(url):
    """POST a search query to the site's search endpoint and return the
    result-page HTML, or "搜索框错误." on request failure.

    Prompts the user for the novel title on stdin.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    name = input("请输入你想要搜索的小说名字:")
    # NOTE(review): the form field name carries a leading space
    # (" searchkey"); verify against the site's actual <form> before
    # "fixing" it — changed form data would change what the server sees.
    keyword = {
        " searchkey": name
    }
    try:
        r = requests.post(url=url, data=keyword, headers=headers, timeout=30)
        r.raise_for_status()  # surface HTTP errors instead of parsing an error page
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # narrowed from a bare except: catch only network/HTTP errors
        return "搜索框错误."
def findbookurl(html):
    """Pull the search-result links out of the search result page.

    Returns a list of strings; each element is the str() of a
    one-element findall list shaped like ['https://...">书名</a>'].
    Downstream code slices off that repr wrapper, so it is intentional
    and must be preserved.
    """
    soup = BeautifulSoup(html, 'html.parser')
    cells = str(soup.find_all("td", attrs={"class": "even"}))
    results = []
    for fragment in cells.split("<td"):
        match = re.findall(r"https.*</a>", fragment)
        if match:
            results.append(str(match))
    return results
def gethtml(url):
    """Fetch the novel chapter/detail page at *url* and return its HTML text.

    On any request failure the original contract is kept: the string
    "产生异常1." is returned instead of raising.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        # timeout keeps a stalled connection from hanging the whole crawl
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        # apparent_encoding sniffs the real charset of the page body
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # narrowed from a bare except: catch only network/HTTP errors
        return "产生异常1."
def geturl_all(url):
    """Fetch a novel's index page and return the chapter-link fragments.

    Returns a list with one entry per <dd> tag; each entry is the list
    produced by re.findall(r"href=.*html", ...) on that tag (empty when
    the <dd> holds no chapter link).  On request failure returns the
    string "产生异常2." to preserve the original error contract.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        # each chapter on this site lives in a <dd><a href="...html"> entry
        return [re.findall(r"href=.*html", str(dd)) for dd in soup.find_all("dd")]
    except requests.RequestException:
        return "产生异常2."
def getnewtexturl(url):
    """Return (as str of a list) the href fragment of the newest chapter.

    *url* is the novel's introduction page; the last <dd> on that page is
    the most recent chapter on this site's layout.  Failure returns
    "产生异常2." (same message as geturl_all, kept from the original).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        soup = BeautifulSoup(r.text, 'html.parser')
        chapters = soup.find_all("dd")
        # chapters[-1] raises IndexError on a page with no <dd> tags; the
        # original bare except swallowed that too, so keep it handled
        ans = re.findall(r"href=.*html", str(chapters[-1]))
        print(ans)
        return str(ans)
    except (requests.RequestException, IndexError):
        return "产生异常2."
def find_text(html):
    """Extract the readable chapter text from a chapter page's HTML.

    Mirrors the original pipeline exactly: collect everything after
    id="content" on its line, take the repr of that list (so carriage
    returns become the two characters backslash-r), split on the
    repr-level paragraph break, then rejoin with spaces and newlines.
    """
    fragments = re.findall(r'id="content"(.*)', html)
    paragraphs = re.split(r"\\r<br />\\r<br /> ", str(fragments))
    return " " + "".join(" " + piece + "\n" for piece in paragraphs)
def hind_all(url):
    """Crawl every chapter of the novel at *url* into "<name>全部章节.txt".

    Relies on the module-level global ``name`` set by the search step.
    """
    url_list = geturl_all(url)
    # 'with' guarantees the output file is closed even if a request fails
    # (the original leaked the handle on any exception before r.close())
    with open(name + "全部章节.txt", 'w+', encoding="utf-8") as out:
        for i in range(0, len(url_list)):
            print("正在爬取中,请稍候当前在", i, "章节")
            # each entry looks like ['href="/82/82620/xxx.html']; [8:-2]
            # strips the  ['href="  prefix and  '] suffix of the repr
            chapter = str(url_list[i])
            chapter_url = "https://www.xbiquge.la" + chapter[8:-2]
            html = gethtml(chapter_url)
            out.write(find_text(html))
    print("爬取成功.")
def hind_last(url):
    """Crawl only the newest chapter at *url* into "<name>最新章节.txt".

    Relies on the module-level global ``name`` set by the search step.
    """
    url = getnewtexturl(url)  # str of ['href="...html'] for the newest chapter
    print(url)
    # [8:-2] strips the  ['href="  prefix and  '] suffix of the repr
    url = "https://www.xbiquge.la" + url[8:-2]
    html = gethtml(url)
    text = find_text(html)
    # the with-block closes the file; the original's explicit f.close()
    # inside it was redundant and has been removed
    with open(name + "最新章节.txt", 'w+', encoding="utf-8") as f:
        f.write(text)
def ma(url):
    """Entry point once a novel URL is known: ask which mode to run.

    "1" crawls every chapter; any other answer fetches only the newest.
    """
    moshi = input("1.爬取全文.\n2.爬取最新章节.\n")
    if moshi == "1":
        hind_all(url)
    else:
        hind_last(url)
# --- script entry -------------------------------------------------------
# Search the site, let the user pick one hit, then hand its URL to ma().
url = "https://www.xbiquge.la/modules/article/waps.php"  # search endpoint
text = findbook(url)
listurl = findbookurl(text)
print(listurl)
num = input("你想要爬取第几本小说?")
# Each listurl entry is the str() of a one-element list like
# ['https://...">书名</a>'], so the slices below skip the  ['  prefix.
choice = listurl[int(num) - 1]
url = choice[2:choice.find("\"")]                     # URL runs up to the closing quote
name = choice[choice.find(">") + 1:choice.find("<")]  # title sits between > and <
ma(url)
转载自:https://juejin.cn/post/7085228383088427022