PDF 임베딩 · BM25+Vector 하이브리드 · Cohere Rerank · 출처 인용 Q&A
W5의 단순 벡터 검색은 의미는 비슷하지만 정확한 숫자/날짜를 놓치는 약점이 있습니다. W6에서는:
| 시나리오 | 벡터만 | 키워드만 | 하이브리드 |
|---|---|---|---|
| "매출이 어떻게 변했나?" (의미) | ✅ 잘함 | ❌ 동의어 못 잡음 | ✅ |
| "2024년 4분기 영업이익은?" (정확값) | ❌ 비슷한 분기 섞임 | ✅ 정확 | ✅ |
| "HBM3E" (고유명사) | ❌ 일반화됨 | ✅ 정확 | ✅ |
-- One row per chunk of a disclosure document. Each row carries both the dense
-- embedding and the full-text vector so that vector search and keyword search
-- can be run against the same table.
create table disclosures (
    id        bigserial primary key,
    ticker    text not null,
    -- Document category: '사업보고서' (annual business report),
    -- '분기' (quarterly report) or '주요공시' (material disclosure).
    doc_type  text,
    doc_year  int,
    page      int,
    section   text,
    content   text,
    embedding vector(1536),  -- `vector` is the pgvector column type; 1536 dimensions
    fts       tsvector       -- full-text vector for keyword (BM25-style) search over Korean content
);
-- Indexes
-- Vector side: ivfflat index over the embeddings using the cosine operator class.
create index
    on disclosures
    using ivfflat (embedding vector_cosine_ops);
-- Keyword side: GIN index over the full-text vector.
create index
    on disclosures
    using gin (fts);
-- Populate fts automatically via a trigger.
-- Trigger function: recomputes the row's `fts` from its `content` using the
-- 'simple' text-search configuration, then returns the modified row so the
-- pending write continues with the refreshed value.
create function disclosures_fts_trigger()
returns trigger
language plpgsql
as $fts$
begin
    new.fts := to_tsvector('simple', new.content);
    return new;
end;
$fts$;
-- Invoke disclosures_fts_trigger() once per row, ahead of every insert or
-- update on disclosures.
create trigger disclosures_fts_update
    before insert or update
    on disclosures
    for each row
    execute function disclosures_fts_trigger();
POST https://api.cohere.com/v1/rerank
Authorization: Bearer {COHERE_KEY}
{
"model": "rerank-multilingual-v3.0",
"query": "{{사용자 질문}}",
"documents": [
"{{청크 본문 1}}", "{{청크 본문 2}}", ...
],
"top_n": 5,
"return_documents": true
}
너는 IR 자료 분석가다. 아래 검색 결과만으로 답변하고, 반드시 출처를 인용하라.
질문: {{사용자 질문}}
검색 결과 (top 5, 신뢰도 순):
[1] {ticker} {doc_type} p.{page}: {content}
[2] ...
응답 규칙:
- 검색 결과에 명시된 사실만 사용. 외부 지식 금지.
- 모든 주장에 [숫자] 형식 출처 인용.
- 검색 결과로 답할 수 없으면 "근거 부족" 명시.
답변:
본 주차 본편은 Supabase pgvector + ts_rank로 hybrid를 구현했지만, Qdrant 1.10+는 동일한 흐름을 한 번의 API 호출로 끝내는 native hybrid를 제공합니다. 사업보고서·공시·애널리스트 리포트 검색에 그대로 적용해봅시다.
| 항목 | Supabase pgvector + ts_rank | Qdrant Native Hybrid |
|---|---|---|
| 호출 횟수 | 2회 (vector + BM25 따로) | 1회 (Query API prefetch) |
| RRF 결합 | n8n Code 노드에서 수동 | 서버 측 자동 |
| Sparse 모델 | Postgres 한국어 형태소 한계 | BM25 / SPLADE 선택 |
| 한국어 처리 | tsvector 한계 (조사 처리) | fastembed 다국어 가능 |
| 스케일 | 500MB 무료, 단일 PG | 1GB 무료, 분산 가능 |
| 학습 곡선 | SQL 익숙하면 쉬움 | Named vector 개념 필요 |
# Collection layout: one dense and one sparse named vector per point, plus the
# payload fields stored alongside them.
collection: "invest_disclosures"
named vectors:
- name: "dense"
size: 1536
distance: Cosine # OpenAI text-embedding-3-small
- name: "sparse"
modifier: idf # BM25 built-in IDF
# sparse vector uses the indices/values format
payload:
- ticker, doc_type, doc_year, page, section, content
임베딩 단계에서 한 청크당 dense + sparse 두 벡터를 동시에 저장. 검색은 동시 한 호출로.
PUT https://YOUR-CLUSTER.qdrant.tech/collections/invest_disclosures
{
"vectors": {
"dense": { "size": 1536, "distance": "Cosine" }
},
"sparse_vectors": {
"sparse": { "modifier": "idf" }
}
}
PUT /collections/invest_disclosures/points
{
"points": [{
"id": 1,
"vector": {
"dense": [0.12, -0.05, ...1536개],
"sparse": { "indices": [42, 1024, ...], "values": [0.7, 0.3, ...] }
},
"payload": {
"ticker": "005930",
"doc_type": "사업보고서", "doc_year": 2024, "page": 187,
"section": "연구개발 활동", "content": "..."
}
}]
}
POST /collections/invest_disclosures/points/query
{
"prefetch": [
{ "query": [0.12, -0.05, ...], "using": "dense", "limit": 20 },
{ "query": {"indices":[...], "values":[...]}, "using": "sparse", "limit": 20 }
],
"query": { "fusion": "rrf" },
"limit": 5,
"with_payload": true,
"filter": { "must": [{"key": "ticker", "match": {"value": "005930"}}] }
}
응답은 두 검색의 RRF 결합 top-5. "fusion": "dbsf"로 distribution-based score fusion도 선택 가능.
| 금융 질의 | Dense만 | BM25만 | Hybrid |
|---|---|---|---|
| "삼성전자 24년 R&D 투자액은?" (정확 숫자) | △ 비슷한 분기 섞임 | ✅ 정확 페이지 | ✅ |
| "AI 메모리 사이클 전망" (의미) | ✅ 동의어 잡음 | ❌ 키워드 한정 | ✅ |
| "HBM3E 단독 공급" (고유명사) | △ 일반화됨 | ✅ 정확 | ✅ |
| "지배구조 ESG 리스크" (개념) | ✅ 의미 검색 | △ 표현 다양 | ✅ |
| "공시번호 20240315000123" (식별자) | ❌ | ✅ exact | ✅ |
W6_BONUS_qdrant_hybrid.json + W6_BONUS_μνμ§μ.xlsx