Extract structured data from unstructured text
- Mohammed Jassim Jasmin

- Feb 4
- 1 min read
pip install langchain transformers torch sentence-transformers accelerate bitsandbytesCode
from langchain.chains import create_extraction_chainfrom pydantic import BaseModel, Fieldfrom transformers import AutoModelForCausalLM, AutoTokenizer, pipelinefrom langchain_community.llms import HuggingFacePipeline# Define your Pydantic schemaclass Person(BaseModel): name: str = Field(description="Full name of the person") age: int = Field(description="Age of the person") location: str = Field(description="City/country of residence") interests: list[str] = Field(description="List of hobbies")# Load a local quantized model (runs on CPU)model_name = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF" # ~5GB RAM usagemodel_file = "mistral-7b-instruct-v0.2.Q4_K_M.gguf"# Initialize modeltokenizer = AutoTokenizer.from_pretrained(model_name)model = AutoModelForCausalLM.from_pretrained( model_name, model_file=model_file, device_map="auto" # Automatically uses CPU/GPU)# Create Hugging Face pipelinepipe = pipeline( "text-generation", model=model, tokenizer=tokenizer, max_new_tokens=200, temperature=0.1)# Wrap in LangChainllm = HuggingFacePipeline(pipeline=pipe)# Create extraction chainchain = create_extraction_chain( schema_pydantic=Person, llm=llm, verbose=True)# Example texttext = """Sarah Johnson, a 34-year-old architect from Berlin, enjoys photography and rock climbing. Her colleague Michael Chen (29) lives in Singapore and likes jazz music and cooking."""# Run extractionresult = chain.run(text)print(result)




Comments