浏览器引擎-Playwright工具封装:基于Playwright封装,构建一个通用的浏览器引擎接口,涵盖常见的搜索引擎操作
一、前言
Playwright 是一款强大的浏览器自动化测试框架,支持主流浏览器(如 Chrome、Firefox、Edge 等)的跨平台自动化操作,功能与 Selenium 类似。它不仅能精细地模拟用户操作,还能精确抓取网页内容,因此被广泛应用于数据抓取、自动化测试和搜索引擎模拟等场景。
基于Playwright封装,构建一个通用的浏览器引擎接口,涵盖常见的搜索引擎操作(如谷歌、百度、Bing)和网页解析功能。通过对这些功能进行抽象和封装,开发者能够轻松实现网页搜索与数据提取,简化日常开发工作中的网页交互与数据处理任务。
Playwright python 文档:playwright.dev/python/docs…
二、安装与使用
pip install playwright
安装浏览器驱动
playwright install
简单使用
headless=False
方式会启动playwright安装好的浏览器驱动,运行效果如下
注意:在一些无gui环境的操作系统下例如 linux 等,请把 headless 设置为 True
三、功能封装
常用搜索引擎封装
浏览器最常见的功能之一就是联网搜索信息。为了模拟这一过程,可以通过访问搜索引擎的入口实现自动化的搜索操作。本工具将封装常用的搜索引擎(如谷歌、百度、Bing),通过模拟用户输入和搜索,来获取对应的搜索结果。
谷歌搜索
import asyncio
from pprint import pprint
from playwright.async_api import async_playwright
from src.tools.schemas import LinkInfo, WebPage
class BrowserEngine:
    """Async wrapper around a Playwright Chromium browser for Google search.

    The browser is launched lazily by the search method and must be released
    by calling ``close_browser`` once the caller is done.
    """

    def __init__(self, headless=True, timeout=10, **launch_kwargs):
        """Store launch settings; no browser is started here.

        Args:
            headless: Forwarded to ``chromium.launch``.
            timeout: Browser launch timeout, in seconds.
            **launch_kwargs: Extra keyword arguments forwarded to
                ``chromium.launch``.
        """
        self.headless = headless
        self.playwright_engine = None
        self.browser = None
        self.launch_kwargs = launch_kwargs
        self.timeout = timeout  # unit seconds

    async def launch_browser(self):
        """Start Playwright and launch Chromium if no browser is open yet."""
        if self.browser is not None:
            return
        if self.playwright_engine is None:
            self.playwright_engine = await async_playwright().start()
        try:
            self.browser = await self.playwright_engine.chromium.launch(
                headless=self.headless,
                timeout=self.timeout * 1000,  # unit ms
                **self.launch_kwargs,
            )
        except Exception:
            # Do not leave an orphaned Playwright driver behind when the
            # browser itself could not be launched.
            await self.playwright_engine.stop()
            self.playwright_engine = None
            raise

    async def close_browser(self):
        """Close the browser and stop Playwright; safe to call repeatedly."""
        browser, engine = self.browser, self.playwright_engine
        # Reset state first so a failure below cannot leave stale handles.
        self.browser = None
        self.playwright_engine = None
        try:
            if browser:
                await browser.close()
        finally:
            # Stop the driver even when closing the browser raised.
            if engine:
                await engine.stop()

    async def google_search(self, query, max_results=8) -> list[LinkInfo]:
        """Search Google for ``query`` and return the parsed results.

        Args:
            query: Search keywords.
            max_results: Maximum number of results to return.

        Returns:
            A list of ``LinkInfo`` objects, at most ``max_results`` long.
        """
        from urllib.parse import quote_plus

        await self.launch_browser()
        page = await self.browser.new_page()
        async with page:
            # The query goes straight into the URL instead of being typed into
            # the search box; it must be encoded so characters such as "&" or
            # "#" are not interpreted as URL syntax.
            await page.goto(f"https://www.google.com/search?q={quote_plus(str(query))}")

            # Wait until the result containers are present on the page.
            el_selector = "div.MjjYud"
            await page.wait_for_selector(el_selector)

            # Collect the containers and hand them to the engine parser.
            results = await page.query_selector_all(el_selector)
            search_results = await self.get_search_results(results, max_results, engine_type="google")
            return search_results

    async def _parse_google_results(self, results, max_results=8) -> list[LinkInfo]:
        """Turn Google result elements into ``LinkInfo`` objects.

        Args:
            results: Element handles, one per result container.
            max_results: Maximum number of entries to return.

        Returns:
            The parsed entries; containers without a title or link are skipped.
        """
        search_results = []
        for result in results:
            # Stop before doing any DOM work for an entry that would be dropped.
            if len(search_results) >= max_results:
                break
            try:
                # Title and link.
                title_element = await result.query_selector("h3")
                if not title_element:
                    continue  # no title: not a usable result
                title = await title_element.text_content()
                url = await title_element.evaluate("el => el.parentElement.href")
                if url is None:
                    continue  # parent has no href, so there is no link

                # Snapshot text, empty when the element is absent.
                snapshot_element = await result.query_selector("div.VwiC3b")
                snapshot = await snapshot_element.text_content() if snapshot_element else ""

                search_results.append(LinkInfo(title=title, url=url, snapshot=snapshot))
            except Exception as e:
                # Best effort: one broken entry must not abort the whole parse.
                print(e)
        return search_results

    async def get_search_results(self, results, max_results=8, engine_type="google"):
        """Dispatch ``results`` to the parser registered for ``engine_type``.

        Raises:
            ValueError: If ``engine_type`` has no registered parser.
        """
        # Only the Google parser is registered in this class.
        parse_method_mapping = {
            "google": self._parse_google_results,
        }
        if engine_type not in parse_method_mapping:
            raise ValueError(f"Engine type {engine_type} is not supported")

        parse_method = parse_method_mapping[engine_type]
        return await parse_method(results, max_results)
async def main():
    """Run a sample Google search and pretty-print the results."""
    engine = BrowserEngine(headless=True)
    try:
        results = await engine.google_search("重试装饰器", max_results=5)
        pprint(results)
    finally:
        # Release the browser even when the search fails.
        await engine.close_browser()


if __name__ == "__main__":
    asyncio.run(main())
搜索结果如下
封装思路如下
-
启动和关闭浏览器:首先封装了启动浏览器的逻辑,通过
launch_browser
方法启动浏览器实例,close_browser
方法则负责关闭浏览器并清理资源。 -
Google 搜索方法:
google_search
方法负责访问 Google 搜索引擎的入口,并模拟搜索行为。通过访问https://www.google.com/search?q={query}
,可以在 URL 中直接传递搜索关键词,避免在搜索框中输入和按键操作。 -
搜索结果解析:使用
get_search_results
方法,根据不同的搜索引擎选择对应的解析方法。针对 Google,使用_parse_google_results
方法提取搜索结果的标题、URL 和快照内容,返回LinkInfo
数据结构。 -
灵活扩展:使用
engine_type
参数支持未来扩展其他搜索引擎(如 Bing 或百度)
一开始我是通过在搜索框中输入关键词并模拟按键来触发搜索,但后来发现每个搜索引擎的入口元素都不一样,写起来比较复杂;改成直接 GET 带查询参数的 URL 之后就简洁多了。这里重点介绍下 _parse_google_results
方法。
async def _parse_google_results(self, results, max_results=8) -> list[LinkInfo]:
    """Turn Google result elements into ``LinkInfo`` objects.

    Args:
        results: Element handles, one per result container.
        max_results: Maximum number of entries to return.

    Returns:
        The parsed entries; containers without a title or link are skipped.
    """
    search_results = []
    for result in results:
        # Stop before doing any DOM work for an entry that would be dropped.
        if len(search_results) >= max_results:
            break
        try:
            # Title and link.
            title_element = await result.query_selector("h3")
            if not title_element:
                continue  # no title: not a usable result
            title = await title_element.text_content()
            url = await title_element.evaluate("el => el.parentElement.href")
            if url is None:
                continue  # parent has no href, so there is no link

            # Snapshot text, empty when the element is absent.
            snapshot_element = await result.query_selector("div.VwiC3b")
            snapshot = await snapshot_element.text_content() if snapshot_element else ""

            search_results.append(LinkInfo(title=title, url=url, snapshot=snapshot))
        except Exception as e:
            # Best effort: one broken entry must not abort the whole parse.
            print(e)
    return search_results
results
参数表示通过搜索后筛选出的 HTML 元素,这里是利用了前端的元素选择器来筛选页面中的特定内容。以 div.MjjYud
为例,它表示所有带有 MjjYud
类选择器的 div
元素。
# CSS selector: every <div> element carrying the class "MjjYud".
el_selector = "div.MjjYud"
# Collect all elements on the page that match the selector.
results = await page.query_selector_all(el_selector)
可以打开浏览器的开发者工具进行网页调试分析
通过这种方式,我们可以快速定位搜索结果的容器,并进一步解析其中的标题、链接和快照内容。然后封装成 LinkInfo
数据对象
from pydantic import BaseModel
class LinkInfo(BaseModel):
    # One parsed search-result entry.
    title: str  # title text of the result
    url: str  # link target of the result
    snapshot: str  # snapshot text shown with the result
以此类推后面的百度、Bing 搜索都是这种逻辑进行封装,只是页面结构不同定位的元素不一样而已,所以我就以google搜索为例进行介绍,后面的就直接贴代码就不一一赘述了。
百度搜索
import asyncio
from playwright.async_api import async_playwright
from src.tools.schemas import LinkInfo, WebPage
class BrowserEngine:
    """Async wrapper around a Playwright Chromium browser for Baidu search.

    The browser is launched lazily by the search method and must be released
    by calling ``close_browser`` once the caller is done.
    """

    def __init__(self, headless=True, timeout=10, **launch_kwargs):
        """Store launch settings; no browser is started here.

        Args:
            headless: Forwarded to ``chromium.launch``.
            timeout: Browser launch timeout, in seconds.
            **launch_kwargs: Extra keyword arguments forwarded to
                ``chromium.launch``.
        """
        self.headless = headless
        self.playwright_engine = None
        self.browser = None
        self.launch_kwargs = launch_kwargs
        self.timeout = timeout  # unit seconds

    async def launch_browser(self):
        """Start Playwright and launch Chromium if no browser is open yet."""
        if self.browser is not None:
            return
        if self.playwright_engine is None:
            self.playwright_engine = await async_playwright().start()
        try:
            self.browser = await self.playwright_engine.chromium.launch(
                headless=self.headless,
                timeout=self.timeout * 1000,  # unit ms
                **self.launch_kwargs,
            )
        except Exception:
            # Do not leave an orphaned Playwright driver behind when the
            # browser itself could not be launched.
            await self.playwright_engine.stop()
            self.playwright_engine = None
            raise

    async def close_browser(self):
        """Close the browser and stop Playwright; safe to call repeatedly."""
        browser, engine = self.browser, self.playwright_engine
        # Reset state first so a failure below cannot leave stale handles.
        self.browser = None
        self.playwright_engine = None
        try:
            if browser:
                await browser.close()
        finally:
            # Stop the driver even when closing the browser raised.
            if engine:
                await engine.stop()

    async def baidu_search(self, query, max_results=8) -> list[LinkInfo]:
        """Search Baidu for ``query`` and return the parsed results.

        Args:
            query: Search keywords.
            max_results: Maximum number of results to return.

        Returns:
            A list of ``LinkInfo`` objects, at most ``max_results`` long.
        """
        from urllib.parse import quote_plus

        await self.launch_browser()
        page = await self.browser.new_page()
        async with page:
            # The query is passed in the URL; encode it so characters such as
            # "&" or "#" are not interpreted as URL syntax.
            await page.goto(f"https://www.baidu.com/s?wd={quote_plus(str(query))}")

            # Wait until the result containers are present on the page.
            el_selector = "div.c-container"
            await page.wait_for_selector(el_selector)

            # Collect the containers and hand them to the engine parser.
            results = await page.query_selector_all(el_selector)
            search_results = await self.get_search_results(results, max_results, engine_type="baidu")
            return search_results

    async def _parse_baidu_results(self, results, max_results=8) -> list[LinkInfo]:
        """Turn Baidu result elements into ``LinkInfo`` objects.

        Args:
            results: Element handles, one per result container.
            max_results: Maximum number of entries to return.

        Returns:
            The parsed entries; containers without a title or link are skipped.
        """
        search_results = []
        for result in results:
            # Stop before doing any DOM work for an entry that would be dropped.
            if len(search_results) >= max_results:
                break
            try:
                # Title and link.
                title_element = await result.query_selector("h3 a")
                if not title_element:
                    continue  # no title: not a usable result
                title = await title_element.text_content()
                url = await title_element.get_attribute("href")
                if url is None:
                    continue  # anchor without href, so there is no link

                # First try the snapshot layout used with a cover image.
                snapshot_element = await result.query_selector("div.c-span9 span.content-right_2s-H4")
                snapshot = await snapshot_element.text_content() if snapshot_element else ""

                # Fall back to the layout used without a cover image.
                if not snapshot:
                    snapshot_element = await result.query_selector("span.content-right_1THTn")
                    snapshot = await snapshot_element.text_content() if snapshot_element else ""

                search_results.append(LinkInfo(title=title, url=url, snapshot=snapshot))
            except Exception as e:
                # Best effort: one broken entry must not abort the whole parse.
                print(e)
        return search_results

    async def get_search_results(self, results, max_results=8, engine_type="google"):
        """Dispatch ``results`` to the parser registered for ``engine_type``.

        Raises:
            ValueError: If ``engine_type`` has no registered parser.
        """
        # Only the Baidu parser is registered in this class, so the default
        # ``engine_type`` of "google" is rejected here.
        parse_method_mapping = {
            "baidu": self._parse_baidu_results,
        }
        if engine_type not in parse_method_mapping:
            raise ValueError(f"Engine type {engine_type} is not supported")

        parse_method = parse_method_mapping[engine_type]
        return await parse_method(results, max_results)
async def main():
    """Run a sample Baidu search, then shut the browser down."""
    engine = BrowserEngine(headless=True)
    try:
        await engine.baidu_search("python异步框架大战", max_results=5)
    finally:
        # Release the browser even when the search fails.
        await engine.close_browser()


if __name__ == "__main__":
    asyncio.run(main())
Bing 搜索
import asyncio
from playwright.async_api import async_playwright
from src.tools.schemas import LinkInfo, WebPage
class BrowserEngine:
    """Async wrapper around a Playwright Chromium browser for Bing search.

    The browser is launched lazily by the search method and must be released
    by calling ``close_browser`` once the caller is done.
    """

    def __init__(self, headless=True, timeout=10, **launch_kwargs):
        """Store launch settings; no browser is started here.

        Args:
            headless: Forwarded to ``chromium.launch``.
            timeout: Browser launch timeout, in seconds.
            **launch_kwargs: Extra keyword arguments forwarded to
                ``chromium.launch``.
        """
        self.headless = headless
        self.playwright_engine = None
        self.browser = None
        self.launch_kwargs = launch_kwargs
        self.timeout = timeout  # unit seconds

    async def launch_browser(self):
        """Start Playwright and launch Chromium if no browser is open yet."""
        if self.browser is not None:
            return
        if self.playwright_engine is None:
            self.playwright_engine = await async_playwright().start()
        try:
            self.browser = await self.playwright_engine.chromium.launch(
                headless=self.headless,
                timeout=self.timeout * 1000,  # unit ms
                **self.launch_kwargs,
            )
        except Exception:
            # Do not leave an orphaned Playwright driver behind when the
            # browser itself could not be launched.
            await self.playwright_engine.stop()
            self.playwright_engine = None
            raise

    async def close_browser(self):
        """Close the browser and stop Playwright; safe to call repeatedly."""
        browser, engine = self.browser, self.playwright_engine
        # Reset state first so a failure below cannot leave stale handles.
        self.browser = None
        self.playwright_engine = None
        try:
            if browser:
                await browser.close()
        finally:
            # Stop the driver even when closing the browser raised.
            if engine:
                await engine.stop()

    async def bing_search(self, query, max_results=8) -> list[LinkInfo]:
        """Search Bing for ``query`` and return the parsed results.

        Args:
            query: Search keywords.
            max_results: Maximum number of results to return.

        Returns:
            A list of ``LinkInfo`` objects, at most ``max_results`` long.
        """
        from urllib.parse import quote_plus

        await self.launch_browser()
        page = await self.browser.new_page()
        async with page:
            # The query is passed in the URL; encode it so characters such as
            # "&" or "#" are not interpreted as URL syntax.
            await page.goto(f"https://www.bing.com/search?q={quote_plus(str(query))}")

            # Wait until the result items are present on the page.
            el_selector = "li.b_algo"
            await page.wait_for_selector(el_selector)

            # Collect the items and hand them to the engine parser.
            results = await page.query_selector_all(el_selector)
            search_results = await self.get_search_results(results, max_results, engine_type="bing")
            return search_results

    async def _parse_bing_results(self, results, max_results=8) -> list[LinkInfo]:
        """Turn Bing result elements into ``LinkInfo`` objects.

        Args:
            results: Element handles, one per result item.
            max_results: Maximum number of entries to return.

        Returns:
            The parsed entries; items without a title or link are skipped.
        """
        search_results = []
        for result in results:
            # Stop before doing any DOM work for an entry that would be dropped.
            if len(search_results) >= max_results:
                break
            try:
                # Title and link.
                title_element = await result.query_selector("h2 a")
                if not title_element:
                    continue  # no title: not a usable result
                title = await title_element.text_content()
                url = await title_element.get_attribute("href")
                if url is None:
                    continue  # anchor without href, so there is no link

                # Snapshot text, empty when the element is absent.
                snapshot_element = await result.query_selector("div.b_caption")
                snapshot = await snapshot_element.text_content() if snapshot_element else ""

                search_results.append(LinkInfo(title=title, url=url, snapshot=snapshot))
            except Exception as e:
                # Best effort: one broken entry must not abort the whole parse.
                print(e)
        return search_results

    async def get_search_results(self, results, max_results=8, engine_type="google"):
        """Dispatch ``results`` to the parser registered for ``engine_type``.

        Raises:
            ValueError: If ``engine_type`` has no registered parser.
        """
        # Only the Bing parser is registered in this class, so the default
        # ``engine_type`` of "google" is rejected here.
        parse_method_mapping = {
            "bing": self._parse_bing_results,
        }
        if engine_type not in parse_method_mapping:
            raise ValueError(f"Engine type {engine_type} is not supported")

        parse_method = parse_method_mapping[engine_type]
        return await parse_method(results, max_results)
async def main():
    """Run a sample Bing search, then shut the browser down."""
    engine = BrowserEngine(headless=True)
    try:
        await engine.bing_search("重试装饰器", max_results=5)
    finally:
        # Release the browser even when the search fails.
        await engine.close_browser()


if __name__ == "__main__":
    asyncio.run(main())
注意:这种浏览器搜索功能是不稳定的,因为数据是取决于网页渲染的结构,一旦发生改变则无法正确的获取数据,又要重新定位数据元素然后改代码。这里只是一个模拟的搜索功能封装。
网页内容获取
方法封装思路
数据对象 WebPage
如下
class WebPage(BaseModel):
    # A fetched web page: its URL plus two textual views of it.
    url: str
    content: str
    inner_text: str

    def __repr__(self):
        # Show only the first 20 characters of each text field so the
        # representation stays short.
        content_preview = self.content[:20]
        inner_text_preview = self.inner_text[:20]
        return f"WebPage(url={self.url}, content={content_preview}, inner_text={inner_text_preview})"
测试效果
四、封装总结
playwright
工具封装主要实现了
常用搜索引擎(如 Google)的功能:能够获取搜索结果的标题、详情链接和快照内容。
获取网页内容:对于网页内容解析,支持通过链接获取完整网页内嵌文本内容、html内容,并支持多链接的并发处理。
未来,这两个功能可以与大语言模型结合,实现联网搜索、网页总结等更智能的功能,赋能大语言模型实时获取信息并生成精准的摘要。
五、源代码
AGI-Demo:AGI技术练习案例
Github:github.com/HuiDBK/AGI-…
转载自:https://juejin.cn/post/7410923898647101490