百度360必应搜狗淘宝本站头条
当前位置:网站首页 > IT技术 > 正文

Oracle向量数据库操作的一些随手笔记

wptr33 2024-12-26 17:07 38 浏览

1. Basic Demo:

| c(2,6). . b(5,6)
| .
| .
| a(2,2)
|_________________________

|b-a| = sqrt( (5-2)^2 + (6-2)^2 ) = 5

-- Euclidean distance between a(2,2) and b(5,6) from the sketch above; expected result: 5.
SELECT
    VECTOR_DISTANCE(
        vector('[2,2]'),
        vector('[5,6]'),
        EUCLIDEAN
    ) AS distance;

How about COSINE?

-- Store for embedded documents: each row holds a text, its vector, and the collection it belongs to.
CREATE TABLE IF NOT EXISTS embedding_store_hysun (
    collection_name  VARCHAR2(200)      NOT NULL,  -- grouping key used to filter searches
    embedding        VECTOR(*, FLOAT32) NOT NULL,  -- any number of dimensions, FLOAT32 elements
    doc              CLOB               NOT NULL,  -- the text that was embedded
    src              VARCHAR2(500)                 -- optional origin of the text (the inserts in this file store a file path)
);

############################ In database embedding ############################

#EXEC DBMS_VECTOR.DROP_ONNX_MODEL(model_name => 'doc_model', force => true);
#SQL> grant DB_DEVELOPER_ROLE to vector;
SQL> grant create mining model to pocuser;
Grant succeeded.
SQL> create or replace directory HYSUN_DUMP as '/u01/ords_sw/hysun_dump';
Directory HYSUN_DUMP created.
SQL> grant read on directory HYSUN_DUMP to pocuser;
Grant succeeded.

-- Load bge-base-zh-v1.5.onnx from directory object HYSUN_DUMP and register it as hysun_bge_zh_model.
-- The JSON metadata declares the model function as "embedding" and names the output "embedding".
EXECUTE DBMS_VECTOR.LOAD_ONNX_MODEL('HYSUN_DUMP','bge-base-zh-v1.5.onnx','hysun_bge_zh_model',JSON('{"function" : "embedding", "embeddingOutput" : "embedding"}'));

-- List the current user's mining models to confirm the model was registered.
SELECT MODEL_NAME, MINING_FUNCTION, ALGORITHM, ALGORITHM_TYPE, MODEL_SIZE
FROM USER_MINING_MODELS;

SQL> INSERT INTO embedding_store_hysun select 'DB_EMBED_TEST0', VECTOR_EMBEDDING(hysun_bge_zh_model USING 'Minimum Age to Get a Licence The minimum age to get a licence. minimum age' as input), 'Minimum Age to Get a Licence The minimum age to get a licence. minimum age', '/home/hysunhe/projects/oracle_vectordb/source_data/cdc_poc/QA_1.txt' from dual;
1 row inserted.

SQL> INSERT INTO embedding_store_hysun select 'DB_EMBED_TEST0', VECTOR_EMBEDDING(hysun_bge_zh_model USING 'Minimum Requirements for Enrolment The list of requirements/ enrolment prerequisites that needs to be met before enrolment. class 3/3a, Class 3A, class 2B, class 2, minimum requirements, enrolment' as input), 'Minimum Requirements for Enrolment The list of requirements/ enrolment prerequisites that needs to be met before enrolment. class 3/3a, Class 3A, class 2B, class 2, minimum requirements, enrolment', '/home/hysunhe/projects/oracle_vectordb/source_data/cdc_poc/QA_2.txt' from dual;
1 row inserted.

SQL> SELECT VECTOR_EMBEDDING(hysun_bge_zh_model USING 'mininum age to get a license' as input) AS embedding;

-- Top-3 nearest rows in collection DB_EMBED_TEST0 by cosine distance.
-- The query text is embedded in-database with the same model used for the stored rows.
SELECT
    s.collection_name,
    s.embedding,
    s.doc,
    s.src,
    VECTOR_DISTANCE(
        s.embedding,
        VECTOR_EMBEDDING(hysun_bge_zh_model USING 'mininum age to get a license' AS input),
        COSINE
    ) AS distance
FROM embedding_store_hysun s
WHERE s.collection_name = 'DB_EMBED_TEST0'
ORDER BY distance
FETCH FIRST 3 ROWS ONLY;

######################## In database embedding end ########################

### Index:

-- Inspect and size the vector memory pool.
show parameter vector_memory_size;
-- vector_memory_size takes a size (for example 512M or 1G), not an ON/OFF switch.
-- 1G is only an example value; size it for the vector indexes you plan to build.
-- NOTE(review): when set in the CDB root this may need SCOPE=SPFILE plus an instance
-- restart instead of SCOPE=BOTH -- confirm against the documentation for your release.
ALTER SYSTEM SET vector_memory_size = 1G SCOPE=BOTH;
SELECT value FROM V$PARAMETER WHERE name='sga_target'; -- (max vector_memory_size = 70% SGA)
-- Allocated and used vector pool memory per container, in MB.
SELECT CON_ID, sum(alloc_bytes) / 1024 / 1024 AS alloc_mb FROM V$VECTOR_MEMORY_POOL GROUP BY CON_ID;
SELECT CON_ID, sum(USED_BYTES) / 1024 / 1024 AS used_mb FROM V$VECTOR_MEMORY_POOL GROUP BY CON_ID;

############################################################

In-Memory Neighbor Graph Vector Index(HNSW)

############################################################

-- Demo table: nine galaxies with hand-written 5-dimensional vectors.
create table galaxies (id number, name varchar2(50), doc varchar2(500), embedding vector);
-- Explicit column lists keep the inserts valid if the table later gains or reorders columns.
insert into galaxies (id, name, doc, embedding) values (1, 'M31', 'Messier 31 is a barred spiral galaxy in the Andromeda constellation which has a lot of barred spiral galaxies.', '[0,2,2,0,0]');
insert into galaxies (id, name, doc, embedding) values (2, 'M33', 'Messier 33 is a spiral galaxy in the Triangulum constellation.', '[0,0,1,0,0]');
insert into galaxies (id, name, doc, embedding) values (3, 'M58', 'Messier 58 is an intermediate barred spiral galaxy in the Virgo constellation.', '[1,1,1,0,0]');
insert into galaxies (id, name, doc, embedding) values (4, 'M63', 'Messier 63 is a spiral galaxy in the Canes Venatici constellation.', '[0,0,1,0,0]');
insert into galaxies (id, name, doc, embedding) values (5, 'M77', 'Messier 77 is a barred spiral galaxy in the Cetus constellation.', '[0,1,1,0,0]');
insert into galaxies (id, name, doc, embedding) values (6, 'M91', 'Messier 91 is a barred spiral galaxy in the Coma Berenices constellation.', '[0,1,1,0,0]');
insert into galaxies (id, name, doc, embedding) values (7, 'M49', 'Messier 49 is a giant elliptical galaxy in the Virgo constellation.', '[0,0,0,1,1]');
insert into galaxies (id, name, doc, embedding) values (8, 'M60', 'Messier 60 is an elliptical galaxy in the Virgo constellation.', '[0,0,0,0,1]');
insert into galaxies (id, name, doc, embedding) values (9, 'NGC1073', 'NGC 1073 is a barred spiral galaxy in Cetus constellation.', '[0,1,1,0,0]');
-- Commit explicitly, as the other insert sections of these notes do.
commit;
-- Exact search: the three galaxies closest (cosine) to [0,1,1,0,0].
SELECT name
FROM galaxies
ORDER BY VECTOR_DISTANCE(embedding, TO_VECTOR('[0,1,1,0,0]'), COSINE)
FETCH FIRST 3 ROWS ONLY;

-- Approximate search: top 4 with the distance rounded to two decimals.
SELECT
    name,
    ROUND(VECTOR_DISTANCE(embedding, TO_VECTOR('[0,1,1,0,0]'), COSINE), 2) AS distance
FROM galaxies
ORDER BY distance
FETCH APPROXIMATE FIRST 4 ROWS ONLY;
-- WITH TARGET ACCURACY 90

-- Capture the execution plan of the approximate search, then print it.
EXPLAIN PLAN FOR
SELECT
    name,
    VECTOR_DISTANCE(embedding, TO_VECTOR('[0,1,1,0,0]'), COSINE) AS distance
FROM galaxies
ORDER BY distance
FETCH APPROXIMATE FIRST 4 ROWS ONLY;

SELECT plan_table_output
FROM TABLE(DBMS_XPLAN.DISPLAY('plan_table', NULL, 'all'));
-- Variant 1: HNSW (in-memory neighbor graph) index with default build parameters.
CREATE VECTOR INDEX galaxies_hnsw_idx ON galaxies (embedding)
    ORGANIZATION INMEMORY NEIGHBOR GRAPH
    DISTANCE COSINE
    WITH TARGET ACCURACY 95;

-- Variant 2 reuses the index name, so the first index must be dropped before it can be
-- created; otherwise the second CREATE fails because the name is already in use.
DROP INDEX galaxies_hnsw_idx;

-- Variant 2: HNSW index with explicit graph parameters.
CREATE VECTOR INDEX galaxies_hnsw_idx ON galaxies (embedding)
    ORGANIZATION INMEMORY NEIGHBOR GRAPH
    DISTANCE COSINE
    WITH TARGET ACCURACY 90
    PARAMETERS (TYPE HNSW, NEIGHBORS 40, EFCONSTRUCTION 500);

-- Approximate top-4 search with a relational filter and a per-query target accuracy.
SELECT
    name,
    ROUND(VECTOR_DISTANCE(embedding, TO_VECTOR('[0,1,1,0,0]'), COSINE), 3) AS distance
FROM galaxies
WHERE name <> 'NGC1073'
ORDER BY distance
FETCH APPROXIMATE FIRST 4 ROWS ONLY WITH TARGET ACCURACY 90;

-- Clean up before the IVF section.
DROP INDEX galaxies_hnsw_idx;

##############################################################

Neighbor Partition Vector Index (IVF)

##############################################################

-- Variant 1: IVF (neighbor partitions) index with default build parameters.
CREATE VECTOR INDEX galaxies_ivf_idx ON galaxies (embedding)
    ORGANIZATION NEIGHBOR PARTITIONS
    DISTANCE COSINE
    WITH TARGET ACCURACY 95;

-- Variant 2 reuses the index name, so drop the first index before creating it;
-- otherwise the second CREATE fails because the name is already in use.
DROP INDEX galaxies_ivf_idx;

-- Variant 2: IVF index with an explicit partition count.
CREATE VECTOR INDEX galaxies_ivf_idx ON galaxies (embedding)
    ORGANIZATION NEIGHBOR PARTITIONS
    DISTANCE COSINE
    WITH TARGET ACCURACY 90
    PARAMETERS (TYPE IVF, NEIGHBOR PARTITIONS 100);
-- Note: the APPROX and APPROXIMATE keywords are optional. If they are omitted while
-- connected to an ADB-S instance, an approximate search using a vector index is
-- attempted if one exists.
-- Accuracy report: ask DBMS_VECTOR how the IVF index performs for one query vector
-- (top 10 neighbours, target accuracy 95) and print the returned text.
SET SERVEROUTPUT ON
DECLARE
    l_accuracy_report VARCHAR2(128);
BEGIN
    l_accuracy_report := DBMS_VECTOR.INDEX_ACCURACY_QUERY(
        owner_name      => 'POCUSER',
        index_name      => 'GALAXIES_IVF_IDX',
        qv              => TO_VECTOR('[0,1,1,0,0]'),
        top_k           => 10,
        target_accuracy => 95
    );
    DBMS_OUTPUT.PUT_LINE(l_accuracy_report);
END;
/

-- Index detail:

-- Let pocuser read VECSYS.VECTOR$INDEX, which the next query selects from.
grant read on VECSYS.VECTOR$INDEX to pocuser;
-- Show the stored parameters of GALAXIES_IVF_IDX as pretty-printed JSON text.
SELECT JSON_SERIALIZE(IDX_PARAMS RETURNING VARCHAR2 PRETTY)
FROM VECSYS.VECTOR$INDEX WHERE IDX_NAME = 'GALAXIES_IVF_IDX';
-- Public database link to a remote PDB.
-- NOTE(review): this statement embeds a plaintext password and a concrete host address;
-- replace both before reusing it, and do not keep real credentials in shared notes.
CREATE PUBLIC DATABASE LINK LinkToLA1 CONNECT TO vectordemo IDENTIFIED BY "welcome1" USING '146.235.233.91:1521/pdb1.sub08030309530.justinvnc1.oraclevcn.com';
-- List the database links visible to the current user.
select OWNER, DB_LINK, USERNAME, VALID, HOST from all_db_links;
-- presumably needed because the link name does not match the remote global database name -- TODO confirm
alter session set global_names=false;
-- Smoke test: run a trivial query through the link.
select 1 from dual@LINKTOLA1;

#### Memo

-- Privilege and directory object for the RAG source documents.
GRANT CREATE ANY DIRECTORY TO pocuser;
CREATE DIRECTORY rag_doc_dir AS '/u01/hysun/rag_docs';

-- Uploaded source files, stored as binary content.
CREATE TABLE rag_files (
    file_name     VARCHAR2(500),
    file_content  BLOB
);

-- Table with the same column layout as galaxies (id, name, doc, embedding).
CREATE TABLE rag_indb_pipeline (
    id         NUMBER,
    name       VARCHAR2(50),
    doc        VARCHAR2(500),
    embedding  VECTOR
);
-- Load one PDF from directory object RAG_DOC_DIR into RAG_FILES.
-- The file is read through a BFILE into a temporary BLOB, inserted, and committed.
-- The temporary BLOB is freed and the BFILE closed on both the success and the error path.
DECLARE
    l_file_name  VARCHAR2(500) := 'Oracle向量数据库_lab.pdf';
    l_content    BLOB;
    l_source     BFILE := BFILENAME('RAG_DOC_DIR', l_file_name);
BEGIN
    DBMS_LOB.OPEN(l_source, DBMS_LOB.LOB_READONLY);                -- open the server-side file
    DBMS_LOB.CREATETEMPORARY(l_content, TRUE, DBMS_LOB.SESSION);   -- temporary BLOB to receive the bytes
    -- Opening/closing the temporary BLOB itself is optional, so it is not done here.
    DBMS_LOB.LOADFROMFILE(l_content, l_source, DBMS_LOB.GETLENGTH(l_source));
    DBMS_LOB.CLOSE(l_source);

    INSERT INTO rag_files (file_name, file_content) VALUES (l_file_name, l_content);
    COMMIT;

    -- The row now holds its own copy; release the temporary LOB.
    DBMS_LOB.FREETEMPORARY(l_content);
EXCEPTION
    WHEN OTHERS THEN
        -- Release resources, then re-raise so the caller still sees the original error.
        IF DBMS_LOB.FILEISOPEN(l_source) = 1 THEN
            DBMS_LOB.FILECLOSE(l_source);
        END IF;
        IF l_content IS NOT NULL AND DBMS_LOB.ISTEMPORARY(l_content) = 1 THEN
            DBMS_LOB.FREETEMPORARY(l_content);
        END IF;
        RAISE;
END;
/
-- Shorter alternative to the PL/SQL loader: convert the BFILE to a BLOB inline with TO_BLOB.
-- NOTE(review): if the PL/SQL block above was also run, the same PDF is now stored twice
-- (under two different file_name values), and later pipelines will process both rows.
insert into RAG_FILES(file_name, file_content) values('oracle-vector-lab', to_blob(bfilename('RAG_DOC_DIR', 'Oracle向量数据库_lab.pdf')));
commit;
-- Check the stored size of every file.
select DBMS_LOB.getLength(FILE_CONTENT) from RAG_FILES;
-- Recreate the chunk table. IF EXISTS keeps the script re-runnable: a plain DROP fails
-- on the first run, when the table does not exist yet.
DROP TABLE IF EXISTS rag_doc_chunks PURGE;
CREATE TABLE rag_doc_chunks (
    doc_id           VARCHAR2(500),   -- file name of the source document
    chunk_id         NUMBER,          -- chunk sequence number reported by the pipeline
    chunk_data       VARCHAR2(4000),  -- chunk text
    chunk_embedding  VECTOR           -- embedding of chunk_data
);
-- In-database pipeline, default chunking:
--   utl_to_text:       PDF    -> TEXT
--   utl_to_chunks:     TEXT   -> CHUNKS
--   utl_to_embeddings: CHUNKS -> VECTORS
-- Each element returned by utl_to_embeddings is JSON; JSON_TABLE unpacks it into columns.
-- The comma joins are intentional: each row source is correlated with the one to its left.
INSERT INTO rag_doc_chunks (doc_id, chunk_id, chunk_data, chunk_embedding)
SELECT
    dt.file_name               AS doc_id,
    et.embed_id                AS chunk_id,
    et.embed_data              AS chunk_data,
    TO_VECTOR(et.embed_vector) AS chunk_embedding
FROM
    rag_files dt,
    dbms_vector_chain.utl_to_embeddings(
        dbms_vector_chain.utl_to_chunks(
            dbms_vector_chain.utl_to_text(dt.file_content),
            json('{"normalize":"all"}')
        ),
        json('{"provider":"database", "model":"mydoc_model"}')
    ) t,
    JSON_TABLE(
        t.column_value,
        '$[*]' COLUMNS (
            embed_id     NUMBER         PATH '$.embed_id',
            embed_data   VARCHAR2(4000) PATH '$.embed_data',
            embed_vector CLOB           PATH '$.embed_vector'
        )
    ) et;
COMMIT;
-- Same pipeline with explicit chunking parameters (word-based, max 240, overlap 15,
-- recursive split, Simplified Chinese).
-- NOTE(review): if the default-parameter insert earlier in this file was also run, this adds a
-- second set of chunks for the same documents; truncate rag_doc_chunks first if only one set is wanted.
INSERT INTO rag_doc_chunks (doc_id, chunk_id, chunk_data, chunk_embedding)
SELECT
    dt.file_name               AS doc_id,
    et.embed_id                AS chunk_id,
    et.embed_data              AS chunk_data,
    TO_VECTOR(et.embed_vector) AS chunk_embedding
FROM
    rag_files dt,
    dbms_vector_chain.utl_to_embeddings(
        dbms_vector_chain.utl_to_chunks(
            dbms_vector_chain.utl_to_text(dt.file_content),
            JSON('{ "by":"words",
"max":"240",
"overlap":"15",
"split":"recursively",
"language":"SIMPLIFIED CHINESE",
"normalize":"all" }')
        ),
        json('{"provider":"database", "model":"mydoc_model"}')
    ) t,
    JSON_TABLE(
        t.column_value,
        '$[*]' COLUMNS (
            embed_id     NUMBER         PATH '$.embed_id',
            embed_data   VARCHAR2(4000) PATH '$.embed_data',
            embed_vector CLOB           PATH '$.embed_vector'
        )
    ) et;
COMMIT;
-- Preview the chunks produced for each stored file.
-- file_content holds PDF binaries, so the text is extracted with utl_to_text (as the insert
-- pipelines in this file do); TO_CLOB would only reinterpret the raw PDF bytes as characters.
SELECT
    dbms_vector_chain.utl_to_chunks(
        dbms_vector_chain.utl_to_text(file_content),
        JSON('{ "by":"words",
"max":"240",
"overlap":"15",
"split":"recursively",
"language":"SIMPLIFIED CHINESE",
"normalize":"all" }')
    )
FROM rag_files;
-- Embed a test string through the OCI Generative AI embedText endpoint, using the
-- stored credential OCI_GENAI_CRED_FOR_APEX and the cohere.embed-multilingual-v3.0 model.
SELECT
dbms_vector.utl_to_embedding(
'This is a test',
json('{
"provider": "OCIGenAI",
"credential_name": "OCI_GENAI_CRED_FOR_APEX",
"url": "https://inference.generativeai.us-chicago-1.oci.oraclecloud.com/20231130/actions/embedText",
"model": "cohere.embed-multilingual-v3.0"
}')
) embedding
FROM dual;
-- Embed the same string with a model stored in the database.
-- NOTE(review): this references "doc_model", while the model loaded elsewhere in this file
-- is named mydoc_model -- confirm which name exists in the target schema.
SELECT
dbms_vector.utl_to_embedding(
'This is a test',
json('{
"provider": "database",
"model": "doc_model"
}')
) embedding
FROM dual;
-- Directory object that holds the ONNX model files.
create or replace directory MODELS_DIR as '/u01/hysun/models';
-- Remove any previously loaded mydoc_model before reloading it.
-- presumably force => true suppresses the error when the model does not exist -- TODO confirm
EXEC DBMS_VECTOR.DROP_ONNX_MODEL(model_name => 'mydoc_model', force => true);
-- Alternative load with explicit metadata (input attribute mapped to DATA), kept for reference:
-- BEGIN
-- DBMS_VECTOR.LOAD_ONNX_MODEL(
-- directory => 'MODELS_DIR',
-- file_name => 'bge-base-zh-v1.5.onnx',
-- model_name => 'mydoc_model',
-- metadata => JSON('{"function" : "embedding", "embeddingOutput" : "embedding", "input":{"input": ["DATA"]}}')
-- );
-- END;
-- /
-- Load the model without passing metadata.
-- presumably this relies on metadata embedded in the ONNX file -- TODO confirm
BEGIN
DBMS_VECTOR.LOAD_ONNX_MODEL(
directory => 'MODELS_DIR',
file_name => 'bge-base-zh-v1.5.onnx',
model_name => 'mydoc_model'
);
END;
/
-- Smoke test: embed a short string with the freshly loaded model.
SELECT
    VECTOR_EMBEDDING(mydoc_model USING 'hello' AS data);

-- Closest stored chunk (cosine distance) to the question text, using approximate search.
SELECT
    chunk_data,
    VECTOR_DISTANCE(
        chunk_embedding,
        VECTOR_EMBEDDING(mydoc_model USING '本次实验的先决条件' AS data),
        COSINE
    ) AS distance
FROM rag_doc_chunks
ORDER BY distance
FETCH APPROX FIRST 1 ROWS ONLY;
-- grant CREATE CREDENTIAL
-- Register the credential LAB_OPENAI_CRED referenced by the text-generation calls in this file.
-- The access token is the literal placeholder "EMPTY".
-- presumably the target endpoint does not validate the token -- TODO confirm
BEGIN
DBMS_VECTOR_CHAIN.CREATE_CREDENTIAL (
CREDENTIAL_NAME => 'LAB_OPENAI_CRED',
PARAMS => json('{ "access_token": "EMPTY" }')
);
END;
/
-- Send a question straight to the chat-completions endpoint, without retrieved context.
-- The call uses the "openai" provider settings with a custom URL and the Qwen2-7B-Instruct model.
select dbms_vector_chain.utl_to_generate_text(
'Oracle 向量数据库是什么',
json('{
"provider": "openai",
"credential_name": "LAB_OPENAI_CRED",
"url": "http://146.235.226.110:8098/v1/chat/completions",
"model": "Qwen2-7B-Instruct"
}') ) from dual;
-- Run the LLM once for each of the three chunks closest to the question.
-- utl_to_generate_text returns a single CLOB, so it is called in the select list rather
-- than used as a row source in FROM. The statement now ends with ';' so it no longer
-- runs into the PL/SQL block that follows.
SELECT
    dt.chunk_data,
    dbms_vector_chain.utl_to_generate_text(
        dt.chunk_data,
        json('{
"provider": "openai",
"credential_name": "LAB_OPENAI_CRED",
"url": "http://146.235.226.110:8098/v1/chat/completions",
"model": "Qwen2-7B-Instruct"
}')
    ) AS rag
FROM (
    SELECT chunk_data
    FROM rag_doc_chunks
    ORDER BY VECTOR_DISTANCE(chunk_embedding, VECTOR_EMBEDDING(mydoc_model USING '本次实验的先决条件' AS data), COSINE)
    FETCH APPROX FIRST 3 ROWS ONLY
) dt;
-- End-to-end RAG in PL/SQL: retrieve similar chunks, build a prompt, call the LLM, print the answer.
-- The unused locals (including an apex_json.t_values variable, which made the block depend on
-- APEX being installed) and the dead commented-out parsing code have been removed.
SET SERVEROUTPUT ON
DECLARE
    l_question  VARCHAR2(500) := '本次实验的先决条件';
    l_input     CLOB;   -- full prompt sent to the LLM
    l_context   CLOB;   -- retrieved chunks, one per line
BEGIN
    -- Step 1: retrieve the stored chunks most similar to the question.
    FOR rec IN (
        SELECT chunk_data
        FROM rag_doc_chunks
        ORDER BY VECTOR_DISTANCE(chunk_embedding, VECTOR_EMBEDDING(mydoc_model USING l_question AS data), COSINE)
        FETCH APPROX FIRST 3 ROWS ONLY
    ) LOOP
        l_context := l_context || rec.chunk_data || chr(10);
    END LOOP;

    -- Step 2: prompt engineering -- combine the retrieved context and the user question.
    -- '\n' here is the two-character sequence backslash + n, not a line feed; real line feeds in
    -- the context are replaced with it.
    -- presumably this keeps the prompt valid once it is embedded in the JSON request -- TODO confirm
    l_input := '你是一个诚实且专业的数据库知识问答助手,请仅仅根据提供的上下文信息内容,回答用户的问题,且不要试图编造答案。\n 以下是上下文信息:' || replace(l_context, chr(10), '\n') || '\n请用英文回答用户问题:' || l_question;

    -- Step 3: call the LLM and print the generated answer.
    FOR rec IN (
        SELECT dbms_vector_chain.utl_to_generate_text(
                   l_input,
                   json('{
"provider": "openai",
"credential_name": "LAB_OPENAI_CRED",
"url": "http://146.235.226.110:8098/v1/chat/completions",
"model": "Qwen2-7B-Instruct"
}')
               ) AS rag
        FROM dual
    ) LOOP
        dbms_output.put_line('*** RAG Result: ' || rec.rag);
    END LOOP;
END;
/

```

# Shell commands, not SQL: force-stop instance ai232 of database ai23,
# check the database status, then start the instance again.
srvctl stop instance -d ai23 -i ai232 -force
srvctl status database -d ai23
srvctl start instance -d ai23 -i ai232

相关推荐

Linux高性能服务器设计

C10K和C10M 计算机领域的很多技术都是需求推动的,上世纪90年代,由于互联网的飞速发展,网络服务器无法支撑快速增长的用户规模。1999年,Dan Kegel提出了著名的C10K问题:一台服务器上同时...

独立游戏开发者常犯的十大错误

...

学C了一头雾水该咋办?

学C了一头雾水该怎么办?最简单的方法就是你再学一遍呗。俗话说熟能生巧,铁杵也能磨成针。但是一味的为学而学,这个好像没什么卵用。为什么学了还是一头雾水,重点就在这,找出为什么会这个样子?1、概念理解不深...

C++基础语法梳理:inline 内联函数!虚函数可以是内联函数吗?

上节我们分析了C++基础语法的const,static以及this指针,那么这节内容我们来看一下inline内联函数吧!inline内联函数...

C语言实战小游戏:井字棋(三子棋)大战!文内含有源码

井字棋是黑白棋的一种。井字棋是一种民间传统游戏,又叫九宫棋、圈圈叉叉、一条龙、三子棋等。将正方形对角线连起来,相对两边依次摆上三个双方棋子,只要将自己的三个棋子走成一条线,对方就算输了。但是,有很多时...

C++语言到底是不是C语言的超集之一

C与C++两个关系亲密的编程语言,它们本质上是两种语言,只是C++语言设计时要求尽可能的兼容C语言特性,因此C语言中99%以上的功能都可以使用C++完成。本文探讨那些存在于C语言中的特性,但是在C++...

在C++中,如何避免出现Bug?

C++中的主要问题之一是存在大量行为未定义或对程序员来说意外的构造。我们在使用静态分析器检查各种项目时经常会遇到这些问题。但正如我们所知,最佳做法是在编译阶段尽早检测错误。让我们来看看现代C++中的一...

ESL-通过事件控制FreeSWITCH

通过事件提供的最底层控制机制,允许我们有效地利用工具箱,适时选择使用其中的单个工具。FreeSWITCH是一个核心交换与混合矩阵,它周围有几十个模块提供各种功能特性。我们完全控制了所有的即时信息,这些...

物理老师教你学C++语言(中篇)

一、条件语句与实验判断...

C语言入门指南

当然!以下是关于C语言入门编程的基础介绍和入门建议,希望能帮你顺利起步:C语言入门指南...

C++选择结构,让程序自动进行决策

什么是选择结构?正常的程序都是从上至下顺序执行,这就是顺序结构...

C++特性使用建议

1.引用参数使用引用替代指针且所有不变的引用参数必须加上const。在C语言中,如果函数需要修改变量的值,参数必须为指针,如...

C++程序员学习Zig指南(中篇)

1.复合数据类型结构体与方法的对比C++类:...

研一自学C++啃得动吗?

研一自学C++啃得动吗?在开始前我有一些资料,是我根据网友给的问题精心整理了一份「C++的资料从专业入门到高级教程」,点个关注在评论区回复“888”之后私信回复“888”,全部无偿共享给大家!!!个人...

C++关键字介绍

下表列出了C++中的常用关键字,这些关键字不能作为变量名或其他标识符名称。1、autoC++11的auto用于表示变量的自动类型推断。即在声明变量的时候,根据变量初始值的类型自动为此变量选择匹配的...