---
# Benchmark configuration for a named-entity-recognition (NER) model.
# Sections: model endpoint settings, dataset selection, reported metrics,
# and output files.
#
# NOTE(review): this file had been collapsed onto a single line, which is not
# valid YAML (nested block mappings cannot share one line, and the first `#`
# comment swallowed the rest of the file). The nesting below is reconstructed
# from key order and comment placement -- confirm against the consumer.

# Inference endpoint and generation settings.
model:
  base_url: "http://127.0.0.1:8000"
  max_tokens: 512
  temperature: 0.1
  # NOTE(review): unit not stated in this file; presumably seconds -- confirm.
  timeout: 30

# Datasets to benchmark against.
datasets:
  benchmark_dataset:
    file_path: "ner_benchmark_dataset.jsonl"
    # Use first 100 examples for quick benchmarking
    sample_size: 100
    # Names of the fields read from each dataset record.
    instruction_field: "instruction"
    input_field: "input"
    expected_output_field: "response"

metrics:
  # Primary metrics for HuggingFace
  entity_recognition:
    name: "Entity Recognition F1 Score"
    description: "F1 score for named entity recognition accuracy"
    type: "f1"
  precision:
    name: "Precision Score"
    description: "Precision for entity recognition"
    type: "precision"
  recall:
    name: "Recall Score"
    description: "Recall for entity recognition"
    type: "recall"
  latency:
    name: "Average Latency"
    description: "Average response time in milliseconds"
    type: "latency"

  # Entity type specific performance
  # NOTE(review): assumed to be nested under `metrics` because its comment
  # parallels "Primary metrics" above; if the consumer reads `entity_types`
  # as a top-level key, dedent this stanza by one level.
  entity_types:
    person:
      name: "Person Entity Recognition"
      keywords: ["PERSON", "person", "Person"]
    organization:
      name: "Organization Entity Recognition"
      keywords: ["ORG", "organization", "Organization"]
    location:
      name: "Location Entity Recognition"
      keywords: ["LOC", "location", "Location"]
    miscellaneous:
      name: "Miscellaneous Entity Recognition"
      keywords: ["MISC", "miscellaneous", "Miscellaneous"]

# Where results are written and how many examples are included.
output:
  results_file: "benchmarks.txt"
  detailed_results_file: "benchmark_results.json"
  include_examples: true
  max_examples: 10