import requests
from datetime import datetime, timedelta
from typing import Dict, Optional
import re


def run_get_request(params: dict):
    """
    Utility function to run a GET request against the Wikipedia API
    """
    base_url = "https://en.wikipedia.org/w/api.php"

    # We need to supply headers for the request to work
    headers = {
        "User-Agent": f"NoteworthyDifferences/1.0 (j3ffdick@gmail.com) requests/{requests.__version__}"
    }

    response = requests.get(base_url, params=params, headers=headers)

    # Handle HTTP errors
    response.raise_for_status()

    try:
        json_data = response.json()
    except Exception:
        raise ValueError(f"Unable to parse response: {response}")

    return json_data


def extract_revision_info(json_data, revision=0):
    """
    Utility function to extract page revision info from JSON data returned from an API call

    Args:
        json_data: JSON data returned by get_previous_revisions
        revision: which revision before the current one (0 for current)

    Examples:
        title = 'David_Szalay'
        json_data = get_previous_revisions(title, revisions=100)
        extract_revision_info(json_data)       # Current revision
        extract_revision_info(json_data, 10)   # 10th revision before current
        extract_revision_info(json_data, 100)  # 100th revision before current
    """
    # Extract page and revision info
    pages = json_data["query"]["pages"]
    page_id = list(pages.keys())[0]

    if page_id == "-1":
        # Page not found, return empty dict
        return {"revid": None, "timestamp": None}

    try:
        # Get the specified revision
        rev = pages[page_id]["revisions"][revision]
        revid = rev["revid"]
        timestamp = rev["timestamp"]
    except (KeyError, IndexError):
        # Revision not found, return empty dict
        return {"revid": None, "timestamp": None}

    # NOTUSED: Create permanent URL
    # permanent_url = f"https://en.wikipedia.org/w/index.php?title={title}&oldid={revid}"

    # Remove the parentid key because we don't use it
    _ = rev.pop("parentid", None)

    return rev


def get_revision_from_age(title: str, age_days: int = 0) -> Dict[str, str]:
    """
    Get the revision info of a Wikipedia article closest to a given age in days.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        age_days: Age of the article revision in days (0 for current)

    Returns:
        Dictionary containing:
            - 'revid': Revision id of the article revision
            - 'timestamp': Timestamp of the article revision
    """
    # Get the target date
    target_date = datetime.utcnow() - timedelta(days=age_days)

    # Get the revision closest to the target date
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvlimit": 1,
        "rvdir": "older",
        "rvstart": target_date.isoformat() + "Z",
        "rvprop": "ids|timestamp",
        "format": "json",
    }

    # Run GET request
    json_data = run_get_request(params)

    # Return revision info
    return extract_revision_info(json_data)
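
# A minimal usage sketch of the helpers above (not part of the module's main logic;
# the title 'David_Szalay' is the same illustrative example used in the docstrings,
# and the calls require network access to the Wikipedia API).
def _example_revision_lookup():
    # Current revision: revid and timestamp of the latest edit
    current = get_revision_from_age("David_Szalay")

    # Revision as the article stood roughly one year ago
    year_old = get_revision_from_age("David_Szalay", age_days=365)

    # Either call returns {"revid": None, "timestamp": None} when the page
    # (or a suitable revision) cannot be found
    return current, year_old
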

def get_previous_revisions(title: str, revisions: int = 0) -> dict:
    """
    Get the revision info of a Wikipedia article a certain number of revisions before the current one.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        revisions: How many revisions before the current one to fetch (0 for current, must be between 0 and 499)

    Returns:
        Raw JSON data containing the requested revisions; pass it to extract_revision_info
        to pull out a single revision's:
            - 'revid': Revision id of the article revision
            - 'timestamp': Timestamp of the article revision

    Note:
        In the Wikipedia API, rvlimit is how many revisions will be returned and must be between 1 and 500.
        rvlimit = 1 returns a single revision: the current one.
        rvlimit = 101 returns the current revision and the 100 revisions before it.
        This is why we use rvlimit = revisions + 1.
    """
    # Get the most recent revisions, newest first
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvlimit": revisions + 1,
        "rvdir": "older",
        "rvprop": "ids|timestamp",
        "format": "json",
    }

    # Run GET request
    json_data = run_get_request(params)

    # Return info for all revisions
    return json_data


def get_wikipedia_introduction(revid: int) -> Optional[str]:
    """
    Retrieve the introduction of a Wikipedia article.

    Args:
        revid: Revision id of the article

    Returns:
        Text of the introduction

    Example:
        # Get intro from current article revision
        revision_info = get_revision_from_age("David_Szalay")
        get_wikipedia_introduction(revision_info["revid"])
    """
    # Return None for missing revid
    if not revid:
        return None

    # Get the content of this specific revision
    params = {"action": "parse", "oldid": revid, "prop": "text", "format": "json"}
    json_data = run_get_request(params)

    # Sometimes a revision is deleted and can't be viewed
    # E.g. revid = '1276494621' for Turin
    try:
        html_content = json_data["parse"]["text"]["*"]
    except KeyError:
        return None

    # Extract introduction (text before first section heading)
    # Remove everything from the first