import re
from datetime import datetime, timedelta
from typing import Dict, Optional

import requests


def run_get_request(params: dict):
    """
    Utility function to run GET request against Wikipedia API
    """
    base_url = "https://en.wikipedia.org/w/api.php"

    # We need to supply headers for the request to work
    headers = {
        "User-Agent": f"NoteworthyDifferences/1.0 (j3ffdick@gmail.com) requests/{requests.__version__}"
    }

    response = requests.get(base_url, params=params, headers=headers)

    # Handle HTTP errors
    response.raise_for_status()

    try:
        json_data = response.json()
    except Exception:
        raise ValueError(f"Unable to parse response: {response}")

    return json_data


def extract_revision_info(json_data, revision=0):
    """
    Utility function to extract page revision info from JSON data returned
    from an API call

    Args:
        revision: index of the revision before current (0 for current)

    Examples:
        title = 'David_Szalay'
        json_data = get_previous_revisions(title, revisions=100)
        extract_revision_info(json_data)       # Current revision
        extract_revision_info(json_data, 10)   # 10th revision before current
        extract_revision_info(json_data, 100)  # 100th revision before current
    """
    # Extract page and revision info
    pages = json_data["query"]["pages"]
    page_id = list(pages.keys())[0]

    if page_id == "-1":
        # Page not found, return empty dict
        return {"revid": None, "timestamp": None}

    try:
        # Get the specified revision (accessing the keys validates the payload)
        revision = pages[page_id]["revisions"][revision]
        revid = revision["revid"]
        timestamp = revision["timestamp"]
    except (KeyError, IndexError):
        # Revision not found, return empty dict
        return {"revid": None, "timestamp": None}

    # NOTUSED: Create permanent URL
    # permanent_url = f"https://en.wikipedia.org/w/index.php?title={title}&oldid={revid}"

    # Remove the parentid key because we don't use it
    _ = revision.pop("parentid", None)

    return revision


def get_revision_from_age(title: str, age_days: int = 0) -> Dict[str, str]:
    """
    Get the revision info of a Wikipedia article closest to the age in days.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        age_days: Age of the article revision in days (0 for current)

    Returns:
        Dictionary containing:
            - 'revid': Revision id of the article revision
            - 'timestamp': Timestamp of the article revision
    """
    # Get the target date
    target_date = datetime.utcnow() - timedelta(days=age_days)

    # Get the revision closest to the target date
    params = {
        "action": "query",
        "titles": title,
        "prop": "revisions",
        "rvlimit": 1,
        "rvdir": "older",
        "rvstart": target_date.isoformat() + "Z",
        "rvprop": "ids|timestamp",
        "format": "json",
    }

    # Run GET request
    json_data = run_get_request(params)

    # Return revision info
    return extract_revision_info(json_data)
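

# Usage sketch for get_revision_from_age: the title comes from the docstring
# examples and the 30-day window is an arbitrary illustration; the call hits
# the live Wikipedia API and is not run on import.
def _demo_revision_from_age() -> None:
    info = get_revision_from_age("David_Szalay", age_days=30)
    print(f"revid={info['revid']}, timestamp={info['timestamp']}")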


def get_previous_revisions(title: str, revisions: int = 0) -> dict:
    """
    Get the revision info of a Wikipedia article a certain number of revisions
    before the current one.

    Args:
        title: Wikipedia article title (e.g., 'David_Szalay')
        revisions: How many revisions before current to fetch
            (0 for current, must be between 0 and 499)

    Returns:
        Raw JSON data covering the current revision and the `revisions`
        revisions before it; pass it to extract_revision_info() to pull out
        a single revision's 'revid' and 'timestamp'

    Note:
        In the Wikipedia API, rvlimit is how many revisions will be returned
        and must be between 1 and 500.
        rvlimit = 1 returns a single revision: the current one.
        rvlimit = 101 returns the current revision and the 100 before it.
        This is why we use rvlimit = revisions + 1.
    """
    # Get the most recent revisions, newest first
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvlimit": revisions + 1,
        "rvdir": "older",
        "rvprop": "ids|timestamp",
        "format": "json",
    }

    # Run GET request
    json_data = run_get_request(params)

    # Return info for all revisions
    return json_data


def get_wikipedia_introduction(revid: int) -> Optional[str]:
    """
    Retrieve the introduction of a Wikipedia article.

    Args:
        revid: Revision id of the article

    Returns:
        Text of the introduction, or None if the revision can't be retrieved

    Example:
        # Get intro from current article revision
        revision_info = get_revision_from_age("David_Szalay")
        get_wikipedia_introduction(revision_info["revid"])
    """
    # Return None for missing revid
    if not revid:
        return None

    # Get the content of this specific revision
    params = {"action": "parse", "oldid": revid, "prop": "text", "format": "json"}
    json_data = run_get_request(params)

    # Sometimes a revision is deleted and can't be viewed
    # E.g. revid = '1276494621' for Turin
    try:
        html_content = json_data["parse"]["text"]["*"]
    except KeyError:
        return None

    # Extract introduction (text before first section heading)
    # Remove everything from the first <h2> tag onwards
    intro_html = re.split(r"<h2", html_content)[0]

    # Strip the remaining HTML tags to get plain text
    intro_text = re.sub(r"<[^>]+>", "", intro_html)

    return intro_text.strip()
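

# Usage sketch mirroring the extract_revision_info docstring example: fetch a
# batch of revisions once, then index into it. revisions=100 is an arbitrary
# example value.
def _demo_previous_revisions() -> None:
    json_data = get_previous_revisions("David_Szalay", revisions=100)
    current = extract_revision_info(json_data)      # Current revision
    older = extract_revision_info(json_data, 100)   # 100th revision before current
    print(current["revid"], older["revid"])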

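# Usage sketch chaining get_revision_from_age into get_wikipedia_introduction
# to fetch the article intro as it looked a year ago. The title and the
# 365-day window are arbitrary example values.
def _demo_introduction() -> None:
    revision_info = get_revision_from_age("David_Szalay", age_days=365)
    print(get_wikipedia_introduction(revision_info["revid"]))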


def get_revisions_behind(revid: int) -> int:
    """
    Get the number of revisions a given revid is behind the current revision
    of the page.

    Args:
        revid: Revision ID of the page

    Returns:
        Integer representing the number of revisions back
        (0 if it's the current revision)

    Example:
        # Get how many revisions behind a specific revid is
        revisions_behind = get_revisions_behind(123456789)
    """
    # First, get the page title from the revid
    # (prop=revid keeps the response small; the parse result includes the title)
    params = {"action": "parse", "oldid": revid, "prop": "revid", "format": "json"}
    try:
        json_data = run_get_request(params)
        title = json_data["parse"]["title"]
    except Exception:
        # If we can't get the title, the revid might be invalid
        raise ValueError(
            f"Could not retrieve page title for revid {revid}. "
            f"The revid may be invalid or deleted."
        )

    # Search through revisions going back from current
    # We'll paginate through results if needed
    revision_count = 0
    continue_token = None

    # Run the loop twice to get up to 1000 revisions behind
    for _ in range(2):
        params = {
            "action": "query",
            "titles": title,
            "prop": "revisions",
            "rvlimit": 500,  # API limit per request
            "rvdir": "older",
            "rvprop": "ids",
            "format": "json",
        }
        if continue_token:
            params["rvcontinue"] = continue_token

        try:
            json_data = run_get_request(params)
            pages = json_data["query"]["pages"]
            page_id = list(pages.keys())[0]

            if page_id == "-1":
                raise ValueError(f"Page not found for revid {revid}")

            revisions = pages[page_id]["revisions"]

            # Find the index of the given revid in the current batch of revisions
            for i, revision in enumerate(revisions):
                if revision["revid"] == revid:
                    return revision_count + i

            # Update the count of revisions we've checked
            revision_count += len(revisions)

            # Check if there are more revisions to search
            continue_token = json_data.get("continue", {}).get("rvcontinue")
            if not continue_token:
                # Reached the end of revisions but didn't find the revid
                raise ValueError(
                    f"Revid {revid} not found in the revision history of the page. "
                    f"It may be from a different page or may have been deleted."
                )
        except ValueError:
            # Re-raise ValueError exceptions
            raise
        except Exception as e:
            raise ValueError(f"Error searching for revid {revid}: {e}")

    # If we looped without returning, the revid is more than 1000 revisions
    # behind; return the number of revisions checked as a negative number
    negative_revision_count = -revision_count
    return negative_revision_count


def get_random_wikipedia_title():
    """
    Get the title of a random Wikipedia article in the main namespace.

    Returns:
        Title string, or None if the request fails
    """
    params = {
        "action": "query",
        "list": "random",
        "rnnamespace": 0,
        "rnlimit": 1,
        "format": "json",
    }

    try:
        json_data = run_get_request(params)

        # Extract the title
        title = json_data["query"]["random"][0]["title"]
        return title
    except requests.RequestException as e:
        print(f"Error fetching random Wikipedia title: {e}")
        return None
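

# End-to-end sketch tying the helpers together: pick a random article, find
# its revision from a week ago, and report how far behind the current revision
# it is. The 7-day window is an arbitrary choice; the calls hit the live API.
if __name__ == "__main__":
    random_title = get_random_wikipedia_title()
    if random_title:
        info = get_revision_from_age(random_title, age_days=7)
        if info["revid"]:
            behind = get_revisions_behind(info["revid"])
            print(f"{random_title}: revid {info['revid']} is {behind} revision(s) behind")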