Freebase数据库搭建及Python调用
本地Freebase数据库搭建及Python查询调用
Requirements
- OpenLink Virtuoso 7.2.11
- Python (SPARQLWrapper, urllib)
Setup
Freebase data dump
提供网盘数据下载
https://pan.baidu.com/s/1poRzNg2xSYTF1DQEfvmEDg 提取码: aeak
OpenLink Virtuoso
可以通过下面的链接下载免安装版,也可以在该链接下载源码进行编译安装
https://github.com/openlink/virtuoso-opensource/releases/tag/v7.2.11
Managing the Virtuoso service
脚本链接:https://github.com/dki-lab/Freebase-Setup
可先参考本文Questions章节修改脚本
提供了一个用于管理Virtuoso服务的包装器脚本(`virtuoso.py`)。要使用它,首先将脚本中的 `virtuosoPath` 更改为本地Virtuoso安装目录。假设Virtuoso数据库文件位于与脚本 `virtuoso.py` 相同目录下名为 `virtuoso_db` 的目录中,并且3001是服务预期使用的HTTP端口,则可用以下命令启动Virtuoso服务:
python virtuoso.py start 3001 -d virtuoso_db
停止同一端口上当前正在运行的服务:
python virtuoso.py stop 3001
建议使用至少具有100 GB RAM的服务器。您可以通过提供的脚本调整服务可能使用的最大RAM量和其他配置。
运行成功之后界面输出如下
在浏览器界面输入localhost:3001
出现以下界面表示运行成功
Python调用数据库
安装python库
pip install SPARQLWrapper
# urllib 属于 Python 3 标准库,无需通过 pip 安装
可通过python对本地数据库进行查询调用
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : freebase.py
# @Author : AZ
# @Time : 2023/9/12 下午9:11
from SPARQLWrapper import SPARQLWrapper, JSON
from urllib.parse import urlparse
# Client bound to the SPARQL endpoint exposed by the local Virtuoso instance
# (HTTP port 3001, as used when starting the service).
sparql = SPARQLWrapper('http://localhost:3001/sparql')

# ---------------------------------------------------------------------------
# Example 1: look up the English [name] of a single entity.
# ---------------------------------------------------------------------------
# entity = "m.01tm_5"
# sparql_query = "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>"\
#                "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>"\
#                "PREFIX : <http://rdf.freebase.com/ns/>"\
#                "SELECT DISTINCT ?name where {" \
#                f"VALUES ?x0 {{:{entity}}} " \
#                "?x0 :type.object.name ?name " \
#                "FILTER(LANGMATCHES(LANG(?name), 'en'))." \
#                "}"

# ---------------------------------------------------------------------------
# Example 2: for an entity, list [relation, neighbour MID, neighbour name]
# for every triple in which the entity appears as the object.
# ---------------------------------------------------------------------------
# entity = "m.07tczt"
# sparql_query = "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> " \
#                "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> " \
#                "PREFIX : <http://rdf.freebase.com/ns/> " \
#                "SELECT DISTINCT ?ns ?x0 ?name WHERE { " \
#                f"VALUES ?x3 {{:{entity}}} " \
#                "?x0 ?ns ?x3 . " \
#                "?x0 :type.object.name ?name " \
#                "FILTER ( ?x0 != ?x3 )" \
#                "FILTER(LANGMATCHES(LANG(?name), 'en'))." \
#                "}"

# ---------------------------------------------------------------------------
# Example 3 (active): for a relation, list
# [head MID, head name, tail MID, tail name] of every triple using it.
# ---------------------------------------------------------------------------
relation = 'spaceflight.rocket_engine_manufacturer.rocket_engines'
# Optional: restrict the head entity by adding the VALUES clause below.
# entity = 'm.0dl93'
# f"VALUES ?x0 {{:{entity}}} " \
sparql_query = (
    "PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> "
    "PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> "
    "PREFIX : <http://rdf.freebase.com/ns/> "
    "SELECT DISTINCT ?x0 ?name0 ?x1 ?name1 WHERE { "
    f"?x0 :{relation} ?x1."
    "FILTER ( ?x0 != ?x1 )"
    "?x1 :type.object.name ?name1 "
    "FILTER(LANGMATCHES(LANG(?name1), 'en'))"
    "?x0 :type.object.name ?name0 "
    "FILTER(LANGMATCHES(LANG(?name0), 'en'))"
    "}"
)

sparql.setQuery(sparql_query)
sparql.setReturnFormat(JSON)
# Equivalent two-step form; the converted JSON result is a dict:
# results = sparql.query().convert()
# print(results['results']['bindings'])

try:
    ret = sparql.queryAndConvert()
    # print(ret)
    for res in ret["results"]["bindings"]:
        # Only URI bindings are reduced to their path (e.g. '/ns/m.03fkyw').
        # Literal bindings such as names are kept verbatim: running them
        # through urlparse would cut text at ':', '?' or '#'.
        ans = [
            urlparse(cell['value']).path if cell.get('type') == 'uri'
            else cell['value']
            for cell in res.values()
        ]
        print(f"{ans}")
        # To get the bare MID instead of the '/ns/<mid>' path:
        # AnswerArgument = urlparse(r['value']['value']).path
        # AnswerArgument = AnswerArgument.split('/')[2]
        # print(AnswerArgument)
except Exception as e:
    # Top-level boundary of a demo script: report the failure (endpoint
    # unreachable, malformed query, ...) instead of printing a traceback.
    print(e)
输出结果
['/ns/m.03fkyw', 'SpaceX', '/ns/m.03g0k4', 'Merlin']
['/ns/m.03fkyw', 'SpaceX', '/ns/m.03g0sk', 'Kestrel']
['/ns/m.01vhl6', 'Rocketdyne', '/ns/m.032blg', 'Space Shuttle main engine']
['/ns/m.01vhl6', 'Rocketdyne', '/ns/m.034_44', 'Rocketdyne F-1']
['/ns/m.01vhl6', 'Rocketdyne', '/ns/m.043jfp', 'RS-68']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlty_', 'RD-250']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltwn', 'RD-250PM']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.07dk1d', 'RD-180']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlt61', 'RD-550']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlt6v', 'RD-512']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlt7j', 'RD-511']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlt87', 'RD-510']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlt9t', 'RD-502']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvw0', 'RD-161P']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltxc', 'RD-250M']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwf0', 'RD-117']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvp4', 'RD-2']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvqg', 'RD-1']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltjy', 'RD-270']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02wts37', 'RD-107']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwpz', 'RD-108']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwqt', 'RD-107A']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvdf', 'RD-215U']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.07ffmj', 'RD-170']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlty5', 'RD-250P']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv5y', 'RD-218U']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltbl', 'RD-350']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltc9', 'RD-303']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltd2', 'RD-302']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltdw', 'RD-301']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv54', 'RD-219']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvgs', 'RD-214U']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvpt', 'RD-1KhZ']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlth7', 'RD-274']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltgf', 'RD-275']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvb0', 'RD-216U']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw89', 'RD-123']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwlp', 'RD-110']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv95', 'RD-216M']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwcf', 'RD-119']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlt_j', 'RD-224D']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltj2', 'RD-273']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltks', 'RD-268']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltlm', 'RD-264']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltmg', 'RD-263']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltn9', 'RD-262']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltp2', 'RD-261']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltpy', 'RD-254']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltqp', 'RD-253F']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltrh', 'RD-253']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltsb', 'RD-252']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltt3', 'RD-251M']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltvt', 'RD-251']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltzt', 'RD-225']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv09', 'RD-224']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv14', 'RD-223']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv1y', 'RD-222']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv2t', 'RD-221']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv3k', 'RD-220']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv4c', 'RD-219U']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv6s', 'RD-218']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv7m', 'RD-217U']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvbw', 'RD-216']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvcq', 'RD-215M']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvf4', 'RD-215']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwgn', 'RD-115']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwhf', 'RD-114']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwj8', 'RD-113']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwk0', 'RD-112']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwmh', 'RD-109']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlttz', 'RD-251P']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvnk', 'RD-3']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw1q', 'RD-134']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvsj', 'RD-172']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvv5', 'RD-171']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvwq', 'RD-161']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw9x', 'RD-120K']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlt8_', 'RD-503']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xltfn', 'RD-280']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlv_b', 'RD-141']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvfy', 'RD-214F']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvhm', 'RD-214']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvjg', 'RD-213 (D-13)']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvk8', 'RD-212 (D-41)']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvl3', 'RD-211']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvlz', 'RD-210']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvms', 'RD-200']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvr3', 'RD-191']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvtc', 'RD-171M']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvxg', 'RD-150']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvy6', 'RD-146']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvyz', 'RD-143']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlvzl', 'RD-142']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw03', 'RD-136']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw0x', 'RD-135']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw2g', 'RD-133']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw36', 'RD-130']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw3z', 'RD-129']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw4p', 'RD-128']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw5d', 'RD-127']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw63', 'RD-126']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw6v', 'RD-125']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw7j', 'RD-124']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlw93', 'RD-121']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwd6', 'RD-118']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwft', 'RD-116']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwkv', 'RD-111']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwn8', 'RD-108A']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwp3', 'RD-108MM']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwrn', 'RD-107MM']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlws_', 'RD-106']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwtv', 'RD-105']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwvp', 'RD-103M']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwwh', 'RD-103']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.02xlwxb', 'RD-101']
['/ns/m.02p_3s6', 'NPO Energomash', '/ns/m.03jz69l', 'RD-100']
['/ns/m.05fj01', 'Thiokol', '/ns/m.03nqqz', 'Reaction Motors XLR99']
['/ns/m.0dl93', 'Pratt & Whitney', '/ns/m.07tczt', 'RS-27A']
['/ns/m.0dl93', 'Pratt & Whitney', '/ns/m.0351y6', 'RL10']
['/ns/m.06pwfx', 'Kuznetsov Design Bureau', '/ns/m.0b19_q', 'NK-33']
['/ns/m.080jfm1', 'Reaction Motors', '/ns/m.02nc08', 'Reaction Motors XLR11']
['/ns/m.0511xg', 'Yuzhmash', '/ns/m.02xlwbn', 'RD-120']
['/ns/m.03c_441', 'Chemical Automatics Design Bureau', '/ns/m.07kfw0m', 'RD-0146']
Questions
搭建过程中在网页显示的步骤出现了一些问题,需要对脚本进行修改,主要修改点是在 [HTTPServer] 配置中增加了 ServerRoot,即HTTP服务提供网页文件的根目录(vsp目录)的路径
"ServerRoot = /home/name/Freebase/virtuoso-opensource/vsp\n"
virtuoso.py 脚本修改
# This script provides a convenient wrapper for the Virtuoso SPARQL server.
# Adapted from Sempre (https://github.com/percyliang/sempre)
import os
import sys
import subprocess
import argparse
# Root directory of the local Virtuoso installation; the helpers below run
# the binaries under "<virtuosoPath>/bin". Edit this to match your machine.
virtuosoPath = "/usr/local/virtuoso-opensource"

# Fail fast when the installation directory is missing.
if not os.path.exists(virtuosoPath):
    print(f"{virtuosoPath} does not exist")
    sys.exit(1)
# Virtuoso has two services: the server (isql) and SPARQL endpoint
def isqlPort(port):
    """Return the isql (SQL server) port paired with HTTP port ``port``.

    The isql port is the HTTP port shifted up by 10000.
    """
    offset = 10000
    return offset + port
def httpPort(port):
    """Return the HTTP (SPARQL endpoint) port, which is ``port`` itself."""
    http_port = port
    return http_port
def run(command):
    """Echo ``command``, run it through the shell and return its stdout.

    Args:
        command: Shell command line. It is passed to the shell as-is, so
            pipes work; it must only be built from trusted values.

    Returns:
        The captured standard output as ``bytes``. Standard error is not
        captured, and the exit status is not checked.
    """
    print(f"RUNNING: {command}")
    # shell=True is required: callers pass pipelines such as "echo ... | isql".
    completed = subprocess.run(command, shell=True, stdout=subprocess.PIPE)
    return completed.stdout
def start(dbPath, port):
    """Write a virtuoso.ini into ``dbPath`` and launch the Virtuoso server.

    Args:
        dbPath: Directory holding the database files; created if missing.
            The generated ``virtuoso.ini`` is written into it.
        port: HTTP port for the SPARQL endpoint. The isql port is derived
            from it via ``isqlPort``.
    """
    os.makedirs(dbPath, exist_ok=True)

    # Buffer sizing. Each buffer is 8K. The Virtuoso docs suggest a larger
    # share of RAM; this script deliberately uses 15% of the *total* RAM so
    # that the result is the same across runs.
    # Alternative: use a fraction of the free RAM (varies across runs):
    # memFree = parseInt(`cat /proc/meminfo | grep MemFree | awk '{print $2}'`) # KB
    memTotalKB = int(run("cat /proc/meminfo | grep MemTotal | awk '{print $2}'"))  # KB
    # Written into the ini file, so they must be whole numbers.
    numberOfBuffers = int(memTotalKB * 0.15 / 8)
    maxDirtyBuffers = numberOfBuffers // 2
    print(f"{memTotalKB} KB total, using {numberOfBuffers} buffers, {maxDirtyBuffers} dirty buffers")

    # Configuration options:
    # http://docs.openlinksw.com/virtuoso/dbadm.html
    # http://virtuoso.openlinksw.com/dataspace/doc/dav/wiki/Main/VirtConfigScale
    config = (
        "[Database]\n"
        f"DatabaseFile = {dbPath}/virtuoso.db\n"
        f"ErrorLogFile = {dbPath}/virtuoso.log\n"
        f"LockFile = {dbPath}/virtuoso.lck\n"
        f"TransactionFile = {dbPath}/virtuoso.trx\n"
        f"xa_persistent_file = {dbPath}/virtuoso.pxa\n"
        "ErrorLogLevel = 7\n"
        "FileExtend = 200\n"
        "MaxCheckpointRemap = 2000\n"
        "Striping = 0\n"
        "TempStorage = TempDatabase\n"
        "\n"
        "[TempDatabase]\n"
        f"DatabaseFile = {dbPath}/virtuoso-temp.db\n"
        f"TransactionFile = {dbPath}/virtuoso-temp.trx\n"
        "MaxCheckpointRemap = 2000\n"
        "Striping = 0\n"
        "\n"
        "[Parameters]\n"
        f"ServerPort = {isqlPort(port)}\n"
        "LiteMode = 0\n"
        "DisableUnixSocket = 1\n"
        "DisableTcpSocket = 0\n"
        "ServerThreads = 100 ; increased from 20\n"
        "CheckpointInterval = 60\n"
        "O_DIRECT = 1 ; increased from 0\n"
        "CaseMode = 2\n"
        "MaxStaticCursorRows = 100000\n"
        "CheckpointAuditTrail = 0\n"
        "AllowOSCalls = 0\n"
        "SchedulerInterval = 10\n"
        "DirsAllowed = .\n"
        "ThreadCleanupInterval = 0\n"
        "ThreadThreshold = 10\n"
        "ResourcesCleanupInterval = 0\n"
        "FreeTextBatchSize = 100000\n"
        # "SingleCPU = 0\n"
        "PrefixResultNames = 0\n"
        "RdfFreeTextRulesSize = 100\n"
        # NOTE(review): IndexTreeMaps and PrefixResultNames each appear twice
        # in this section (IndexTreeMaps with 256 here and 64 below). Kept
        # as-is; confirm which value Virtuoso honours and drop the other.
        "IndexTreeMaps = 256\n"
        "MaxMemPoolSize = 200000000\n"
        "PrefixResultNames = 0\n"
        "MacSpotlight = 0\n"
        "IndexTreeMaps = 64\n"
        f"NumberOfBuffers = {numberOfBuffers}\n"
        f"MaxDirtyBuffers = {maxDirtyBuffers}\n"
        "\n"
        "[SPARQL]\n"
        "ResultSetMaxRows = 50000\n"
        "MaxQueryCostEstimationTime = 600 ; in seconds (increased)\n"
        "MaxQueryExecutionTime = 180; in seconds (increased)\n"
        "\n"
        "[HTTPServer]\n"
        f"ServerPort = {httpPort(port)}\n"
        # Needed for the web UI to open in the browser. The path is
        # machine-specific: point it at the "vsp" directory of your own
        # Virtuoso installation.
        "ServerRoot = /home/name/Freebase/virtuoso-opensource/vsp\n"
        "Charset = UTF-8\n"
        "ServerThreads = 15 ; increased from unknown\n"
    )

    configPath = f"{dbPath}/virtuoso.ini"
    # configPath = f"/home/name/Freebase/virtuoso-opensource/database/virtuoso.ini"
    print(config)
    print()
    print(configPath)
    print(f"==== Starting Virtuoso server for {dbPath} on port {port}...")
    with open(configPath, 'w', encoding='utf-8') as f:
        f.write(config)

    run(f"{virtuosoPath}/bin/virtuoso-t +configfile {configPath} +wait")
    # Alternative invocation using virtuoso-t from PATH with the -fd flag:
    # run(f"virtuoso-t +configfile {configPath} -fd")
def stop(port):
    """Send ``shutdown;`` through isql to the server paired with ``port``.

    Args:
        port: HTTP port the service was started with; the isql port is
            derived from it via ``isqlPort``.
    """
    output = run(f"echo 'shutdown;' | {virtuosoPath}/bin/isql localhost:{isqlPort(port)}")
    # run() captures stdout, so isql's reply would otherwise be lost; show it
    # so the user can see whether the shutdown was accepted.
    print(output.decode(errors="replace"))
def status(port):
    """Run ``status();`` through isql and print the server's report.

    Args:
        port: HTTP port the service was started with; the isql port is
            derived from it via ``isqlPort``.

    Returns:
        The decoded isql output, which is also printed.
    """
    output = run(f"echo 'status();' | {virtuosoPath}/bin/isql localhost:{isqlPort(port)}")
    # run() captures stdout; without printing it here the status report
    # would never reach the user.
    report = output.decode(errors="replace")
    print(report)
    return report
############################################################
# Main
if __name__ == "__main__":
    # Command-line entry point:
    #   python virtuoso.py start  <port> -d <db-path>
    #   python virtuoso.py stop   <port>
    #   python virtuoso.py status <port>
    parser = argparse.ArgumentParser(description="manage Virtuoso services")
    parser.add_argument("action", type=str, help="start, stop or status")
    parser.add_argument("port", type=int, help="port for the SPARQL HTTP endpoint")
    parser.add_argument("-d", "--db-path", type=str, help="path to the db directory")
    args = parser.parse_args()

    if args.action == "start":
        if not args.db_path:
            print("please specify path to the db directory with -d")
            sys.exit(1)  # non-zero: the request could not be carried out
        if not os.path.isdir(args.db_path):
            print("the path specified does not exist")
            sys.exit(1)
        start(args.db_path, args.port)
    elif args.action == "stop":
        stop(args.port)
    elif args.action == "status":
        status(args.port)
    else:
        print(f"invalid action: {args.action}")
        sys.exit(1)
转载自:https://juejin.cn/post/7283690681175113740