Extract structured data from unstructured text

pip install langchain transformers torch sentence-transformers accelerate bitsandbytes

Code

from langchain.chains import create_extraction_chain

from pydantic import BaseModel, Field

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from langchain_community.llms import HuggingFacePipeline

# Define your Pydantic schema

class Person(BaseModel):

    name: str = Field(description="Full name of the person")

    age: int = Field(description="Age of the person")

    location: str = Field(description="City/country of residence")

    interests: list[str] = Field(description="List of hobbies")

# Load a local quantized model (runs on CPU)

model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"  # ~5GB RAM usage

model_file = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"

# Initialize model

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(

    model_name,

    model_file=model_file,

    device_map="auto"  # Automatically uses CPU/GPU

# Create Hugging Face pipeline

pipe = pipeline(

    "text-generation",

    model=model,

    tokenizer=tokenizer,

    max_new_tokens=200,

    temperature=0.1

# Wrap in LangChain

llm = HuggingFacePipeline(pipeline=pipe)

# Create extraction chain

chain = create_extraction_chain(

    schema_pydantic=Person,

    llm=llm,

    verbose=True

# Example text

text = """

Sarah Johnson, a 34-year-old architect from Berlin, enjoys photography and rock climbing.

Her colleague Michael Chen (29) lives in Singapore and likes jazz music and cooking.

"""

# Run extraction

result = chain.run(text)

print(result)