Crawling 笔趣阁 Novels with a Python Scraper (Using 笔趣阁's Search Engine to Find Different Novels)


Preface

I split this program into two features: one crawls a novel's latest chapter, the other crawls all of its chapters.

For learning purposes only.

Getting the HTML of the novel's detail page

This is done by the function gethtml().

def gethtml(url):  # get the HTML of the novel's detail page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        # time.sleep(5)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding  # guess the encoding so the Chinese text is decoded correctly
        return r.text
    except:
        return "Exception occurred (1)."

Getting the URLs of the individual chapters

For the two features (crawling all chapters and crawling the latest chapter), this step is handled by two separate functions, geturl_all() and getnewtexturl().

They parse the fetched HTML and return the URLs of all chapters and of the latest chapter, respectively.

def geturl_all(url):  # url is the book's detail page; return the URLs of all chapters
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = str(r.text)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")  # every chapter link sits inside a <dd> tag
        url_list = []
        # print(type(url_list))
        for i in range(0, len(ans)):
            url_list.append(re.findall(r"href=.*html", str(ans[i])))  # keep only the href="...html" part
        return url_list
    except:
        return "Exception occurred (2)."
 
def getnewtexturl(url):  # url is the book's detail page; return the URL of the latest chapter
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = str(r.text)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")
        r = str(ans[-1])  # the last <dd> is the newest chapter
        # print(url1)
        ans = re.findall(r"href=.*html", r)
        print(ans)
        return str(ans)
    except:
        return "Exception occurred (2)."

Extracting the body text of a chapter

The data is cleaned with regular expressions to obtain the body text.

def find_text(html):  # extract the chapter body text
    texts = re.findall(r'id="content"(.*)', html)  # everything after the content div's opening tag
    texts = str(texts)
    # split on the <br /> / &nbsp; sequences that separate paragraphs
    texts = re.split(r"\\r<br />\\r<br />&nbsp;&nbsp;&nbsp;&nbsp;", texts)
    # print(texts)
    ans = " "
    for i in range(0, len(texts)):
        ans += "  " + texts[i] + "\n"
    return ans
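The regex split depends on the exact \r<br /> and &nbsp; formatting of the chapter pages, so it is fairly brittle. A minimal alternative sketch, assuming (as the regex above implies) that the chapter text sits inside a <div id="content">, lets BeautifulSoup do the cleanup instead; this helper is not part of the original script:

def find_text_bs(html):  # illustrative alternative to find_text()
    soup = BeautifulSoup(html, 'html.parser')
    content = soup.find(id="content")  # the div holding the chapter body
    if content is None:
        return ""
    # get_text() with a separator turns the <br/> tags into line breaks
    return content.get_text(separator="\n")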

Full code:

import requests
import os
from bs4 import BeautifulSoup
import re
import time

def gethtml(url):  # get the HTML of the novel's detail page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        # time.sleep(5)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Exception occurred (1)."

def geturl_all(url):  # url is the book's detail page; return the URLs of all chapters
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = str(r.text)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")  # every chapter link sits inside a <dd> tag
        url_list = []
        # print(type(url_list))
        for i in range(0, len(ans)):
            url_list.append(re.findall(r"href=.*html", str(ans[i])))
        return url_list
    except:
        return "Exception occurred (2)."

def getnewtexturl(url):  # url is the book's detail page; return the URL of the latest chapter
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = str(r.text)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")
        r = str(ans[-1])  # the last <dd> is the newest chapter
        # print(url1)
        ans = re.findall(r"href=.*html", r)
        print(ans)
        return str(ans)
    except:
        return "Exception occurred (2)."

def find_text(html):  # extract the chapter body text
    texts = re.findall(r'id="content"(.*)', html)
    texts = str(texts)
    # split on the <br /> / &nbsp; sequences that separate paragraphs
    texts = re.split(r"\\r<br />\\r<br />&nbsp;&nbsp;&nbsp;&nbsp;", texts)
    # print(texts)
    ans = " "
    for i in range(0, len(texts)):
        ans += "  " + texts[i] + "\n"
    return ans

moshi = input("1. Crawl the whole book.\n2. Crawl the latest chapter.\n")

def hind_all():
    r = open("novel_all_chapters.txt", 'w+', encoding="utf-8")
    url = "https://www.xbiquge.la/82/82620/"  # the book's detail page
    url_list = geturl_all(url)
    # print(url_list)
    for i in range(0, len(url_list)):
        print("Crawling, please wait... now at chapter", i)
        # print(url_list[i])
        url = str(url_list[i])
        url = "https://www.xbiquge.la" + url[8:-2]  # strip the ['href=" wrapper to get the chapter path
        html = gethtml(url)      # fetch the chapter page
        text = find_text(html)   # extract the chapter body
        r.write(text)
    r.close()
    print("Finished crawling.")

def hind_last():
    url = "https://www.xbiquge.la/82/82620/"  # the book's detail page
    url = getnewtexturl(url)  # URL of the latest chapter
    print(url)
    url = "https://www.xbiquge.la" + url[8:-2]
    # print(url)
    html = gethtml(url)      # fetch the latest chapter page
    # print(html)
    # print(type(html))
    text = find_text(html)   # extract the chapter body
    # print(text)
    with open("novel_latest_chapter.txt", 'w+', encoding="utf-8") as f:
        f.write(text)

if moshi == "1":
    hind_all()
else:
    hind_last()

Update:

After some time I felt the original functionality was still too crude: to crawl a novel you had to supply the URL of its detail page yourself. So I improved the program and added a few functions that provide something like a search-engine feature.

First, simulate a browser and send a POST request containing the name of the novel you want to search for. The following function does this.

def findbook(url):  # POST the search keyword and return the search-result page HTML
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    name = input("Enter the title of the novel you want to search for: ")
    keyword = {
        "searchkey": name  # form field used by the site's search page
    }
    try:
        # r = requests.get(url, headers=headers, keyword=keyword)
        r = requests.post(url=url, data=keyword, headers=headers)
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Search request failed."

Then clean the data: from the response page, pick out the search results, i.e. each matched novel and the URL of its detail page.

def findbookurl(html):  # extract "url + title" strings from the search-result page
    soup = BeautifulSoup(html, 'html.parser')
    soup = soup.find_all("td", attrs={"class": "even"})  # each result row uses <td class="even"> cells
    soup = str(soup)
    listurl = soup.split("<td")
    urllist = []
    for i in range(0, len(listurl)):
        data = re.findall(r"https.*</a>", listurl[i])  # keep only the https...">title</a> part
        # urllist[i] = listurl[i][22:]
        if len(data) == 0:
            continue
        # print(str(data))
        data = str(data)
        urllist.append(data)
        # print(listurl[i])
    return urllist

The return value is a list; each element records the URL and title of one novel found by the search.

Then enter which of the listed books you want to crawl.

listurl = findbookurl(text)
print(listurl)
num = input("Which of the listed books do you want to crawl? ")
urlnum = listurl[int(num)-1].find("\"")              # position of the quote that ends the URL
# print(listurl[int(num)-1][2:urlnum])
url = listurl[int(num)-1][2:urlnum]                  # detail-page URL of the chosen book
namenumstar = listurl[int(num)-1].find(">")
namenumend = listurl[int(num)-1].find("<")
# print(listurl[int(num)-1][namenumstar+1:namenumend])
name = listurl[int(num)-1][namenumstar+1:namenumend] # book title, used later for the output file name
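The index-based slicing works because each list element is a string of the form ['https://.../12/12345/">Title</a>'] (the book id here is made up). A minimal alternative sketch, assuming each result row is a <td class="even"> cell whose <a> tag links to the book's detail page, returns (title, url) pairs directly and avoids the slicing; it is not part of the original script:

def findbookurl_pairs(html):  # illustrative alternative to findbookurl()
    soup = BeautifulSoup(html, 'html.parser')
    pairs = []
    for td in soup.find_all("td", attrs={"class": "even"}):
        a = td.find("a")
        if a is not None and a.get("href"):
            # collect the link text (book title) and the href (detail-page URL)
            pairs.append((a.get_text(strip=True), a["href"]))
    return pairs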


Next, pass that URL into the functions written earlier; the behavior is the same as before. As before, the novel's content is finally written to a TXT file.


Complete code:

import requests
import os
from bs4 import BeautifulSoup
import re
import time
def findbook(url):  # POST the search keyword and return the search-result page HTML
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    name = input("Enter the title of the novel you want to search for: ")
    keyword = {
        "searchkey": name  # form field used by the site's search page
    }
    try:
        # r = requests.get(url, headers=headers, keyword=keyword)
        r = requests.post(url=url, data=keyword, headers=headers)
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Search request failed."
 
def findbookurl(html):  # extract "url + title" strings from the search-result page
    soup = BeautifulSoup(html, 'html.parser')
    soup = soup.find_all("td", attrs={"class": "even"})  # each result row uses <td class="even"> cells
    soup = str(soup)
    listurl = soup.split("<td")
    urllist = []
    for i in range(0, len(listurl)):
        data = re.findall(r"https.*</a>", listurl[i])  # keep only the https...">title</a> part
        # urllist[i] = listurl[i][22:]
        if len(data) == 0:
            continue
        # print(str(data))
        data = str(data)
        urllist.append(data)
        # print(listurl[i])
    return urllist
 
def gethtml(url):  # get the HTML of the novel's detail page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        # time.sleep(5)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        return r.text
    except:
        return "Exception occurred (1)."

def geturl_all(url):  # url is the book's detail page; return the URLs of all chapters
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = str(r.text)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")  # every chapter link sits inside a <dd> tag
        url_list = []
        # print(type(url_list))
        for i in range(0, len(ans)):
            url_list.append(re.findall(r"href=.*html", str(ans[i])))
        return url_list
    except:
        return "Exception occurred (2)."

def getnewtexturl(url):  # url is the book's detail page; return the URL of the latest chapter
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36 Edg/98.0.1108.56"
    }
    try:
        r = requests.get(url, headers=headers)
        r.raise_for_status()
        # print(r.request.headers)
        r.encoding = r.apparent_encoding
        r = str(r.text)
        url1 = BeautifulSoup(r, 'html.parser')
        ans = url1.find_all("dd")
        r = str(ans[-1])  # the last <dd> is the newest chapter
        # print(url1)
        ans = re.findall(r"href=.*html", r)
        print(ans)
        return str(ans)
    except:
        return "Exception occurred (2)."

def find_text(html):  # extract the chapter body text
    texts = re.findall(r'id="content"(.*)', html)
    texts = str(texts)
    # split on the <br /> / &nbsp; sequences that separate paragraphs
    texts = re.split(r"\\r<br />\\r<br />&nbsp;&nbsp;&nbsp;&nbsp;", texts)
    # print(texts)
    ans = " "
    for i in range(0, len(texts)):
        ans += "  " + texts[i] + "\n"
    return ans
 
 
 
def hind_all(url):
    r = open(name + "_all_chapters.txt", 'w+', encoding="utf-8")
    url_list = geturl_all(url)
    # print(url_list)
    for i in range(0, len(url_list)):
        print("Crawling, please wait... now at chapter", i)
        # print(url_list[i])
        url = str(url_list[i])
        url = "https://www.xbiquge.la" + url[8:-2]  # strip the ['href=" wrapper to get the chapter path
        html = gethtml(url)      # fetch the chapter page
        text = find_text(html)   # extract the chapter body
        r.write(text)
    r.close()
    print("Finished crawling.")

def hind_last(url):
    url = getnewtexturl(url)  # URL of the latest chapter
    print(url)
    url = "https://www.xbiquge.la" + url[8:-2]
    # print(url)
    html = gethtml(url)      # fetch the latest chapter page
    # print(html)
    # print(type(html))
    text = find_text(html)   # extract the chapter body
    # print(text)
    with open(name + "_latest_chapter.txt", 'w+', encoding="utf-8") as f:
        f.write(text)
 
def ma(url):
    moshi = input("1. Crawl the whole book.\n2. Crawl the latest chapter.\n")
    # url = "https://www.xbiquge.la/41/41051/"
    if moshi == "1":
        hind_all(url)
    else:
        hind_last(url)

url = "https://www.xbiquge.la/modules/article/waps.php"  # the site's search endpoint
text = findbook(url)
listurl = findbookurl(text)
print(listurl)
num = input("Which of the listed books do you want to crawl? ")
urlnum = listurl[int(num)-1].find("\"")              # position of the quote that ends the URL
# print(listurl[int(num)-1][2:urlnum])
url = listurl[int(num)-1][2:urlnum]                  # detail-page URL of the chosen book
namenumstar = listurl[int(num)-1].find(">")
namenumend = listurl[int(num)-1].find("<")
# print(listurl[int(num)-1][namenumstar+1:namenumend])
name = listurl[int(num)-1][namenumstar+1:namenumend] # book title, used for the output file name
ma(url)