Spaces:
Sleeping
Sleeping
| from typing_extensions import Literal | |
| import operator | |
| from typing import Annotated, List, Literal, TypedDict, Any | |
| from langgraph.graph import END, START, StateGraph | |
| from langgraph.types import Command, interrupt | |
| import os | |
| import json | |
| import re | |
| from typing import TypedDict, List, Dict, Optional | |
| import base64 | |
| import requests | |
| from langchain_mistralai import ChatMistralAI | |
| import requests | |
| from tavily import TavilyClient | |
| import gradio as gr | |
# API credentials are looked up once, at import time; each is None when the
# corresponding environment variable is not set.
MISTRAL_API_KEY = os.environ.get("MISTRAL_API_KEY")
TAVILY_API_KEY = os.environ.get("TAVILY_API_KEY")
| # Step 1 pipeline: Tavily search -> categorize -> (optional) summarize top items -> format output | |
| # Requirements: | |
| # pip install tavily-python langgraph requests | |
| import os | |
| import re | |
| from typing import TypedDict, List, Dict, Any | |
| import requests | |
| from tavily import TavilyClient | |
| # --- State definition for query-based pipeline --- | |
class State(TypedDict, total=False):
    """Shared state handed from node to node in the search pipeline.

    Declared with ``total=False`` because the keys are filled in step by
    step: the caller seeds only ``query`` (``max_results`` is optional and
    read with a default), and each node adds its own key to a copy of the
    state it received.
    """

    query: str  # search text supplied by the caller
    max_results: int  # optional cap on the number of search results
    raw_results: List[Dict[str, Any]]  # raw items returned by Tavily
    categorized: Dict[str, List[Dict[str, Any]]]  # bucket -> list of items
    summaries: Dict[str, List[Dict[str, str]]]  # bucket -> list of {url, title, summary}
    final_output: str  # formatted report text
| # --- Helpers: simple domain and keyword heuristics for categorization --- | |
# Regex patterns for sources treated as research / academic.
RESEARCH_DOMAINS = [
    r"\.edu$",
    r"arxiv\.org",
    r"nature\.com",
    r"sciencemag\.org",
    r"ieeexplore\.ieee\.org",
    r"acm\.org",
    r"pubmed\.ncbi\.nlm\.nih\.gov",
]

# Regex patterns for sources treated as news-like.
NEWS_DOMAINS = [
    r"\.com$",
    r"\.news$",
    r"nytimes\.com",
    r"theguardian\.com",
    r"reuters\.com",
    r"bbc\.co",
]

# Plain substrings (not regexes) that mark a URL as a blog / opinion piece.
BLOG_KEYWORDS = [
    "blog",
    "opinion",
    "medium.com",
    "substack",
    "dev.to",
]

# Plain substrings that mark a title or snippet as introductory material.
BEGINNER_KEYWORDS = [
    "introduction",
    "what is",
    "beginner",
    "tutorial",
    "guide",
    "overview",
]
def domain_matches(url: str, patterns: List[str]) -> bool:
    """Return True when any regex in *patterns* matches *url* or its hostname.

    Each pattern is tried against the parsed hostname as well as the raw
    URL. Matching only the raw URL meant that end-anchored patterns such as
    ``\\.edu$`` could never match a URL that carries a path or query string.

    Args:
        url: URL to test. ``None`` or an empty string never matches.
        patterns: Regular-expression source strings.

    Returns:
        True if at least one pattern matches, otherwise False.
    """
    # Function-scope import so this block does not depend on the file header.
    from urllib.parse import urlparse

    if not url:
        return False
    try:
        # A scheme-less value ("example.edu/page") only yields a hostname when
        # it is parsed as a network-path reference, hence the "//" prefix.
        host = urlparse(url if "//" in url else f"//{url}").hostname or ""
    except ValueError:
        # Malformed netloc (e.g. a broken IPv6 literal): fall back to the raw URL.
        host = ""
    return any(re.search(p, host) or re.search(p, url) for p in patterns)
def score_item_for_buckets(item: Dict[str, Any]) -> str:
    """Pick the bucket label for one search result using simple heuristics.

    The checks run in priority order: research, news (news-like domain plus a
    news keyword), blog / opinion, beginner, then a domain-only news fallback,
    and finally the general bucket.

    Args:
        item: Result dict; ``url``, ``title`` and ``snippet`` are read and any
            of them may be missing or None.

    Returns:
        One of the four bucket label strings.
    """
    # NOTE(review): the non-ASCII prefixes of the bucket labels look
    # mis-encoded; they are left untouched because the labels are used as
    # dictionary keys elsewhere in the file.
    # ``or ""`` instead of a ``.get`` default: the key can be present with a
    # None value, and the substring tests below need a string.
    url = item.get("url") or ""
    url_lc = url.lower()  # keyword checks should not depend on URL casing
    title = (item.get("title") or "").lower()
    snippet = (item.get("snippet") or "").lower()
    # The separator keeps a keyword from matching across the title/snippet seam.
    text = f"{title} {snippet}"

    # research heuristics
    if domain_matches(url, RESEARCH_DOMAINS) or "pdf" in url_lc or "arxiv" in url_lc:
        return "π§ Research / Academic"

    # news heuristics
    is_news_domain = domain_matches(url, NEWS_DOMAINS)
    if is_news_domain and any(word in text for word in ("news", "breaking", "report", "update")):
        return "π° Recent News / Updates"

    # blog / opinion heuristics
    if any(k in url_lc for k in BLOG_KEYWORDS) or any(
        k in text for k in ("opinion", "column", "blog", "i think")
    ):
        return "π¬ Opinion / Blog / Casual"

    # beginner heuristics
    if any(k in text for k in BEGINNER_KEYWORDS) or "wikipedia.org" in url_lc:
        return "π General / Beginner"

    # fallback: a news-like domain without any other signal counts as news
    if is_news_domain:
        return "π° Recent News / Updates"

    # fallback default
    return "π General / Beginner"
| # --- Node: perform Tavily search --- | |
def perform_search(state: State) -> State:
    """Run a Tavily search for ``state["query"]`` and store normalized results.

    Args:
        state: Pipeline state; ``query`` is required and ``max_results`` is
            optional (defaults to 10).

    Returns:
        A copy of *state* with ``raw_results`` set to a list of
        ``{"url", "title", "snippet", "raw"}`` dicts.

    Raises:
        EnvironmentError: If ``TAVILY_API_KEY`` is not set.
    """
    api_key = os.getenv("TAVILY_API_KEY")
    if not api_key:
        raise EnvironmentError("TAVILY_API_KEY is required in environment")

    max_results = state.get("max_results")
    if max_results is None:
        max_results = 10

    client = TavilyClient(api_key=api_key)
    resp = client.search(query=state["query"], max_results=max_results)

    # The client normally returns a dict with a "results" list. A bare list is
    # tolerated too, and a dict without "results" is treated as no results
    # rather than being sliced.
    results = (resp.get("results") if isinstance(resp, dict) else resp) or []

    items: List[Dict[str, Any]] = []
    for r in results[:max_results]:
        if not isinstance(r, dict):
            continue
        url = r.get("url") or r.get("link") or r.get("document_url") or r.get("source")
        title = r.get("title") or r.get("headline") or ""
        # Tavily puts the result text under "content"; the other keys are kept
        # as fallbacks for differently shaped responses.
        snippet = (
            r.get("content")
            or r.get("snippet")
            or r.get("summary")
            or r.get("excerpt")
            or r.get("text")
            or ""
        )
        items.append({"url": url, "title": title, "snippet": snippet, "raw": r})
    return {**state, "raw_results": items}
| # --- Node: categorize results into the four buckets --- | |
def categorize_results(state: State) -> State:
    """Group ``state["raw_results"]`` by the label from score_item_for_buckets.

    The four standard buckets are always present in the result, in a fixed
    order, even when empty. Returns a copy of *state* with ``categorized`` set.
    """
    standard_buckets = (
        "π§ Research / Academic",
        "π General / Beginner",
        "π° Recent News / Updates",
        "π¬ Opinion / Blog / Casual",
    )
    # One fresh list per bucket so the buckets never share storage.
    grouped = {label: [] for label in standard_buckets}
    for entry in state["raw_results"]:
        label = score_item_for_buckets(entry)
        if label not in grouped:
            grouped[label] = []
        grouped[label].append(entry)

    updated = dict(state)
    updated["categorized"] = grouped
    return updated
| # --- Node: summarize top N items per bucket using Mistral --- | |
MISTRAL_API_KEY = os.getenv("MISTRAL_API_KEY")
MISTRAL_MODEL = "mistral-large-latest"
_MISTRAL_CHAT_URL = "https://api.mistral.ai/v1/chat/completions"


def _fallback_summary(item: Dict[str, Any], text: str) -> Dict[str, Any]:
    """Build a summary entry from the item's own title/url plus *text*."""
    return {"title": item.get("title"), "url": item.get("url"), "summary": text}


def _build_summary_prompt(item: Dict[str, Any]) -> str:
    """Return the summarization prompt for one search result."""
    lines = [
        "",
        "You are an assistant that summarizes webpages. Provide a short (1-2 sentence) summary for the following item.",
        "Return only JSON with keys: title, url, summary.",
        f"Title: {item.get('title')}",
        f"URL: {item.get('url')}",
        f"Snippet/Excerpt: {item.get('snippet')}",
        'If snippet is missing, make a short summary that says "no snippet available".',
        "",
    ]
    return "\n".join(lines)


def _parse_summary_reply(content: Any, item: Dict[str, Any]) -> Dict[str, Any]:
    """Turn the model reply into a ``{title, url, summary}`` dict.

    A JSON object reply is decoded and completed from *item* where keys are
    missing; anything else is used verbatim as the summary text.
    """
    text = content.strip() if isinstance(content, str) else str(content or "")
    # Replies are sometimes wrapped in a Markdown code fence; unwrap it first.
    fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, re.DOTALL)
    candidate = fenced.group(1) if fenced else text
    if candidate.startswith("{"):
        # SECURITY: this used to be eval(content). The reply is untrusted text,
        # so it is decoded with json.loads instead, which also handles JSON
        # true/false/null that eval would reject.
        try:
            data = json.loads(candidate)
        except ValueError:
            data = None
        if isinstance(data, dict):
            return {
                "title": data.get("title") or item.get("title"),
                "url": data.get("url") or item.get("url"),
                "summary": str(data.get("summary") or ""),
            }
    return _fallback_summary(item, text)


def summarize_top_items(state: State, top_n: int = 3) -> State:
    """Summarize the first *top_n* items of every bucket with Mistral.

    Args:
        state: Pipeline state; ``categorized`` must be present.
        top_n: Number of items per bucket to summarize.

    Returns:
        A copy of *state* with ``summaries`` mapping each bucket to a list of
        ``{title, url, summary}`` dicts. Without ``MISTRAL_API_KEY`` the
        summaries are empty strings. A failed request yields a
        ``"(summary failed: ...)"`` summary instead of raising.
    """
    categorized = state["categorized"]

    if not MISTRAL_API_KEY:
        # No key: skip the API calls but keep the same output shape.
        skipped = {
            bucket: [_fallback_summary(it, "") for it in items[:top_n]]
            for bucket, items in categorized.items()
        }
        return {**state, "summaries": skipped}

    headers = {
        "Authorization": f"Bearer {MISTRAL_API_KEY}",
        "Content-Type": "application/json",
    }
    summaries: Dict[str, List[Dict[str, str]]] = {}
    for bucket, items in categorized.items():
        bucket_summaries = []
        for it in items[:top_n]:
            body = {
                "model": MISTRAL_MODEL,
                "messages": [{"role": "user", "content": _build_summary_prompt(it)}],
                "temperature": 0.0,
                "max_tokens": 200,
            }
            try:
                r = requests.post(_MISTRAL_CHAT_URL, headers=headers, json=body, timeout=15)
                r.raise_for_status()
                content = r.json()["choices"][0]["message"]["content"]
            except Exception as e:
                # Deliberately broad: summarization is best effort, and one
                # failed item must not abort the whole pipeline.
                parsed = _fallback_summary(it, f"(summary failed: {e})")
            else:
                parsed = _parse_summary_reply(content, it)
            bucket_summaries.append(parsed)
        summaries[bucket] = bucket_summaries
    return {**state, "summaries": summaries}
| # --- Node: format final output --- | |
def format_output(state: State) -> State:
    """Render the categorized results as a plain-text report.

    For every bucket a heading with the item count is written, followed by the
    bucket's summaries when there are any, or otherwise by up to five raw
    items with their snippets. Returns a copy of *state* with ``final_output``
    set to the report text.
    """
    # NOTE(review): the non-ASCII characters in the literals below look
    # mis-encoded; they are reproduced unchanged.
    all_summaries = state.get("summaries", {})
    report = [f"π Query: {state['query']}", ""]

    for name, entries in state["categorized"].items():
        report.append(f"## {name} β {len(entries)} results")
        bucket_summaries = all_summaries.get(name, [])
        if not bucket_summaries:
            # No summaries for this bucket: list the basic items instead.
            report.extend(
                f"- {e.get('title') or '(no title)'} β {e.get('url')}\n {e.get('snippet') or ''}"
                for e in entries[:5]
            )
        else:
            for entry in bucket_summaries:
                heading = entry.get("title") or "(no title)"
                link = entry.get("url") or "(no url)"
                blurb = entry.get("summary") or ""
                report.append(f"- {heading}\n {link}\n {blurb}")
        report.append("")

    result = dict(state)
    result["final_output"] = "\n".join(report)
    return result
| # --- LangGraph wiring (example, mimic your earlier code) --- | |
| # If you use langgraph exactly as in your example, adapt this snippet: | |
# Linear pipeline: search -> categorize -> summarize -> format.
builder = StateGraph(State)
builder.add_node("perform_search", perform_search)
builder.add_node("categorize_results", categorize_results)
builder.add_node("summarize_top_items", summarize_top_items)
builder.add_node("format_output", format_output)

builder.add_edge(START, "perform_search")
builder.add_edge("perform_search", "categorize_results")
builder.add_edge("categorize_results", "summarize_top_items")
builder.add_edge("summarize_top_items", "format_output")
# Terminate explicitly so the last node is not left without an outgoing edge.
builder.add_edge("format_output", END)

graph = builder.compile()
def analyze_text(input_text: str):
    """Run the search pipeline for a topic and return text for display.

    Args:
        input_text: Topic typed by the user. Surrounding whitespace is
            stripped before it is used as the query.

    Returns:
        The formatted report, or a short message when the input is blank, the
        pipeline reports an error, no output was produced, or an exception
        was raised.
    """
    # NOTE(review): the non-ASCII prefixes in the messages below look
    # mis-encoded; they are reproduced unchanged.
    query = (input_text or "").strip()
    if not query:
        # Nothing to search for: answer directly instead of invoking the graph.
        return "Please enter a topic to search for."
    try:
        result = graph.invoke({"query": query})
        if "error" in result:
            return f"β Error: {result['error']}"
        if "final_output" in result:
            return result["final_output"]
        return "β οΈ No summary generated. Please check the input text and try again."
    except Exception as e:
        # UI boundary: show the failure to the user instead of crashing the app.
        return f"β οΈ Exception: {str(e)}"
# Gradio front end: one text box in, one multi-line text box out.
# NOTE(review): the non-ASCII characters in the labels look mis-encoded; they
# are reproduced unchanged.
_topic_box = gr.Textbox(label="π Enter a topic youβd like information about")
_summary_box = gr.Textbox(label="π Search summary", lines=15)

iface = gr.Interface(
    fn=analyze_text,
    inputs=_topic_box,
    outputs=_summary_box,
    title="π€ InfoSort",
    description="Searches, Sorts, Summarizes.",
)

# Starts the web UI as soon as the module is executed.
iface.launch(share=True)