Spaces:
Sleeping
Sleeping
| import requests | |
| import time | |
| import json | |
| import sys | |
| from pathlib import Path | |
| from tqdm import tqdm | |
| import pandas as pd | |
| MAIL_TO = "[email protected]" | |
def get_openalex_ids(dois, batch_size=50):
    """Retrieve the OpenAlex IDs for a list of DOIs.

    The DOIs are sent to the OpenAlex ``/works`` endpoint in batches, each
    batch joined with ``|`` inside a single ``doi:`` filter.

    Args:
        dois: Sliceable sequence of DOI strings.
        batch_size: Number of DOIs per request; also sent as ``per-page``.
            NOTE(review): OpenAlex caps both the page size and the number of
            OR-ed filter values -- confirm the limits before raising this.

    Returns:
        Dict mapping each DOI returned by the API (with the
        ``https://doi.org/`` prefix removed) to its OpenAlex work ID. DOIs
        from a failed batch, or that the API does not return, are absent.
    """
    results = {}
    for i in range(0, len(dois), batch_size):
        batch = dois[i:i + batch_size]
        pipe_separated_dois = "|".join(batch)
        url = (
            f"https://api.openalex.org/works?filter=doi:{pipe_separated_dois}"
            f"&per-page={batch_size}&select=id,doi&mailto={MAIL_TO}"
        )
        try:
            # A timeout keeps one stalled connection from hanging the run.
            response = requests.get(url, timeout=30)
        except requests.RequestException as exc:
            # Same best-effort policy as a non-200 answer: report and move on.
            print(f"request failed: {exc}")
            continue
        finally:
            time.sleep(0.1)  # Respect API rate limits

        if response.status_code != 200:
            print(f"response failed with code: {response.status_code}")
            continue

        for work in response.json().get("results", []):
            doi = work.get("doi")
            if doi is None:
                # A record without a DOI cannot be keyed; skip it rather
                # than crash on None.replace().
                continue
            results[doi.replace("https://doi.org/", "")] = work.get("id")
    return results
def get_outgoing_citations(openalex_id):
    """Retrieve the outgoing citations of one article given its OpenAlex ID.

    Queries the OpenAlex ``/works`` endpoint with a ``cited_by:`` filter and
    returns the raw result records.

    Only a single page of at most 200 records is requested, so longer
    reference lists are truncated to that first page.

    Args:
        openalex_id: OpenAlex work ID of the article whose references are
            wanted.

    Returns:
        List of work dicts restricted to the fields named in ``select``, or
        an empty list when the request fails or returns a non-200 status.
    """
    url = (
        f"https://api.openalex.org/works?filter=cited_by:{openalex_id}"
        "&select=id,doi,title,keywords,authorships,abstract_inverted_index,"
        "publication_year,primary_location,language"
        "&per-page=200"
        f"&mailto={MAIL_TO}"
    )
    try:
        # A timeout keeps one stalled connection from hanging the run.
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        # Treat network errors like HTTP errors so one bad request does not
        # discard everything the caller has collected so far.
        print(f"request failed: {exc}")
        return []

    if response.status_code != 200:
        print(f"response failed with code: {response.status_code}")
        return []
    return response.json().get("results", [])
def extract_citation_data(citing_articles):
    """Extract relevant metadata from a list of OpenAlex work records.

    Nested fields that are present but null (``primary_location``,
    ``source``, ``authorships``, ``author``) are treated the same as missing
    fields instead of raising.

    Args:
        citing_articles: Iterable of work dicts as returned by the API.

    Returns:
        List of dicts with the keys ``id``, ``doi``, ``title``, ``authors``
        (list of ``{"name", "id"}`` dicts), ``abstract`` (the raw
        ``abstract_inverted_index`` value), ``year``, ``venue`` and
        ``language``. Any value that is unavailable is ``None``.
    """
    citations = []
    for article in citing_articles:
        # `dict.get(key, {})` only covers a missing key; an explicit null
        # value still yields None, so normalise with `or`.
        primary_location = article.get("primary_location") or {}
        source = primary_location.get("source") or {}

        authors = []
        for authorship in article.get("authorships") or []:
            author = authorship.get("author") or {}
            authors.append(
                {"name": author.get("display_name"), "id": author.get("id")}
            )

        citations.append({
            "id": article.get("id"),
            "doi": article.get("doi"),
            "title": article.get("title"),
            "authors": authors,
            "abstract": article.get("abstract_inverted_index"),
            "year": article.get("publication_year"),
            "venue": source.get("display_name"),
            "language": article.get("language"),
        })
    return citations
def fetch_citations_for_dois(doi_list):
    """Fetch the outgoing citations for every DOI that resolves in OpenAlex.

    Args:
        doi_list: DOIs to look up.

    Returns:
        Dict mapping each resolved DOI to the list of work records returned
        by ``get_outgoing_citations`` for it.
    """
    doi_to_id = get_openalex_ids(doi_list)
    print(len(doi_to_id))

    citations_by_doi = {}
    for doi, work_id in tqdm(doi_to_id.items()):
        cited_works = get_outgoing_citations(work_id)
        citations_by_doi[doi] = cited_works
        # 200 is the page size requested by get_outgoing_citations, so a
        # full page means the reference list may have been cut off.
        if len(cited_works) == 200:
            print(">= 200 citations:", doi, work_id)
        time.sleep(0.1)  # Respect API rate limits
    return citations_by_doi
def save_to_file(citations, fn):
    """Serialise ``citations`` as JSON into the file at ``fn``.

    The file is opened in text write mode, so existing content is replaced.
    """
    with open(fn, "w") as out_handle:
        json.dump(citations, out_handle)
if __name__ == "__main__":
    # Usage: <script> <input.parquet> <output.json>
    # Read both arguments up front so a missing output path fails before any
    # network work is done.
    parquet_fn = sys.argv[1]
    out_fn = sys.argv[2]

    data = pd.read_parquet(parquet_fn)
    doi_list = data["OriginalPaperDOI"]

    # The markdown directories depend only on the input file name, so pick
    # them once instead of rebuilding the paths for every DOI.
    base_dir = Path("/mnt/data1/retraction_data")
    if "retraction" in parquet_fn:
        md_dirs = [
            base_dir / "pdf_articles_unpaywall_md",
            base_dir / "pdf_articles_md",
            base_dir / "pdf_articles_manual_md",
        ]
    elif "reference" in parquet_fn:
        md_dirs = [base_dir / "pdf_articles_reference_md"]
    else:
        md_dirs = []
        # Reported once rather than once per DOI.
        print("Can't find any markdown files for these DOI's.")

    # Keep only the DOIs that have a converted full-text markdown file.
    dois_w_fulltext = []
    for doi in doi_list:
        # File names are the bare DOI with "/" replaced by "|".
        md_fn = doi.replace("https://doi.org/", "").replace("/", "|") + ".md"
        if any((md_dir / md_fn).exists() for md_dir in md_dirs):
            dois_w_fulltext.append(doi)

    # For a quick trial run, truncate here, e.g. dois_w_fulltext[:101].
    print(f"Fetching outgoing citations for {len(dois_w_fulltext)} articles.")
    citations_data = fetch_citations_for_dois(dois_w_fulltext)
    save_to_file(citations_data, out_fn)
    print(f"Citations data saved to {out_fn}")