txtai Beginner's Guide

parmarjatin4911@gmail.com · Jan 28 · Dev Community


Install txtai and the supporting libraries used throughout this guide:

pip install txtai sentencepiece sacremoses fasttext torch torchvision

1. Semantic Search

from txtai import Embeddings

# Sample data for indexing

data = [
    "US tops 5 million confirmed virus cases",
    "Canada's last fully intact ice shelf has suddenly collapsed, forming a Manhattan-sized iceberg",
    "Beijing mobilises invasion craft along coast as Taiwan tensions escalate",
    "The National Park Service warns against sacrificing slower friends in a bear attack",
    "Maine man wins $1M from $25 lottery ticket",
    "Make huge profits without work, earn up to $100,000 a day"
]

# Create an embeddings index backed by a sentence-transformers model
embeddings = Embeddings(path="sentence-transformers/nli-mpnet-base-v2")
embeddings.index(data)

print("Semantic Search Results:")
for query in ["feel good story", "climate change"]:
    uid = embeddings.search(query, 1)[0][0]
    print(f"Query: {query}, Result: {data[uid]}")

2. Updates and Deletes

# Work on a copy so the original data list stays intact
udata = data.copy()

uid = embeddings.search("feel good story", 1)[0][0]
print("\nBefore update:", data[uid])

# Update data

udata[0] = "See it: baby panda born"
embeddings.upsert([(0, udata[0], None)])

uid = embeddings.search("feel good story", 1)[0][0]
print("After update:", udata[uid])

# Delete record from index

embeddings.delete([0])

uid = embeddings.search("feel good story", 1)[0][0]
print("After delete:", udata[uid])

3. Persistence

print("\nPersistence Test:")
print("Before saving index")
uid = embeddings.search("climate change", 1)[0][0]
print("Result:", data[uid])

embeddings.save("index")
embeddings = Embeddings() # Resetting embeddings instance
embeddings.load("index")

print("After loading index")
uid = embeddings.search("climate change", 1)[0][0]
print("Result:", data[uid])

4. Keyword Search and Dense Vector Index

# Create embeddings with subindexes

embeddings = Embeddings(
    content=True,
    defaults=False,
    indexes={
        "keyword": {
            "keyword": True
        },
        "dense": {
            "path": "sentence-transformers/nli-mpnet-base-v2"
        }
    }
)
embeddings.index(data)
print("Keyword & Dense Index Search Results:")
for query in ["feel good story", "climate change"]:
    print(f"Query: {query}")
    print("Keyword result:", embeddings.search(query, limit=1, index="keyword"))
    print("Dense result:", embeddings.search(query, limit=1, index="dense"))

5. Hybrid Search (Sparse + Dense)

print("\nHybrid Search Results:")
hybrid_embeddings = Embeddings(hybrid=True, path="sentence-transformers/nli-mpnet-base-v2")
hybrid_embeddings.index(data)
for query in ["public health story", "war"]:
    uid = hybrid_embeddings.search(query, 1)[0][0]
    print(f"Query: {query}, Result: {data[uid]}")

6. Content Storage for Large Amounts of Data

print("\nContent Storage Test:")
content_embeddings = Embeddings(content=True, path="sentence-transformers/nli-mpnet-base-v2")
content_embeddings.index(data)
uid = int(content_embeddings.search("wildlife", 1)[0]["id"])
print("Result:", data[uid])

7. Create Embeddings with a Graph Index

embeddings = Embeddings(
    path="sentence-transformers/nli-mpnet-base-v2",
    content=True,
    functions=[
        {"name": "graph", "function": "graph.attribute"},
    ],
    expressions=[
        {"name": "category", "expression": "graph(indexid, 'category')"},
        {"name": "topic", "expression": "graph(indexid, 'topic')"},
    ],
    graph={
        "topics": {
            "categories": ["health", "climate", "finance", "world politics"]
        }
    }
)

embeddings.index(data)
print("Graph Embeddings Result:")
print(embeddings.search("select topic, category, text from txtai"))

8. Using an LLM

import torch
from txtai.pipeline import LLM

llm = LLM("google/flan-t5-large", torch_dtype=torch.float32)
query = "Where is one place you'd go in Washington, DC?"
result = llm(query)
print("LLM Standalone Result:")
print("Query: ", query, result)

9. RAG (Retrieval-Augmented Generation)

from txtai.pipeline import Extractor

llm_embeddings = Embeddings(path="sentence-transformers/nli-mpnet-base-v2", content=True, autoid="uuid5")
llm_embeddings.index(data)

# Extractor combines the embeddings index (retrieval) with an LLM (generation)
extractor = Extractor(llm_embeddings, "google/flan-t5-large")

llm_query = "What country is having issues with climate change?"
context = lambda question: [{
    "query": question,
    "question": f"Answer the following question using the context below.\nQuestion: {question}\nContext:"
}]
print("RAG Result:")
print(extractor(context(llm_query))[0])

10. Language Model Workflows

from txtai import Application
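
Application loads its index settings, pipelines and workflows from a YAML file. Since the post does not include embeddings.yml, below is a minimal sketch that writes one; the field names follow txtai's Application YAML format, but the specific values (model path, translation function registration, workflow definition) are illustrative assumptions:

# Write an illustrative embeddings.yml; adjust the settings to your own needs
config = """
writable: true

embeddings:
  path: sentence-transformers/nli-mpnet-base-v2
  content: true
  functions:
    - {name: translation, argcount: 2, function: translation}

# Translation pipeline backing the SQL translation() function used below
translation:

# Workflow that runs an embeddings search for each input query
workflow:
  search:
    tasks:
      - search
"""

with open("embeddings.yml", "w") as f:
    f.write(config)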

app = Application("embeddings.yml")

# Add data and index

app.add([{"id": idx, "text": text} for idx, text in enumerate(data)])
app.index()

# Execute the workflow

print("\nLanguage Model Workflows Result:")
print(list(app.workflow("search", ["select text from txtai where similar('feel good story') limit 1"])))
print(app.search("select translation(text, 'ta') text from txtai where similar('feel good story') limit 1"))
