diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1f5afb9..147c062 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -7,6 +7,13 @@ Change Log ------ +3.0.3 +===== + + +* Added new wrangling notebook 16_check_publication_in_GEO.ipynb to check for associated GEO datasets for a given list of PMIDs. + + 3.0.2 ===== diff --git a/notebooks/useful_notebooks/16_check_publication_in_GEO.ipynb b/notebooks/useful_notebooks/16_check_publication_in_GEO.ipynb new file mode 100644 index 0000000..e03e5b2 --- /dev/null +++ b/notebooks/useful_notebooks/16_check_publication_in_GEO.ipynb @@ -0,0 +1,142 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "d4da4f53-c565-4f8f-8c06-fe4884db3a52", + "metadata": {}, + "source": [ + "### PLEASE COPY NOTEBOOKS TO YOUR FOLDERS TO PREVENT COMMIT CONFLICTS\n", + "\n", + "#### For each PMID in a given list, prints the associated GEO dataset ID(s) (GSEXXX)\n", + "\n", + "#### Protocol:\n", + "\n", + "#### Register for an NCBI account and copy the API key shown under NCBI account > NCBI Account Settings\n", + "#### Provide a file with a list of PMIDs, one per line (numbers only, no prefix).\n", + "\n", + "#### NOTE:\n", + "#### Requires the Biopython module\n", + "#### Entrez allows a maximum of 10 requests per second if an API key is provided; otherwise the limit is 3 requests per second.
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "62ec2562-0707-4d78-9f59-bedb6c1812cc", + "metadata": {}, + "outputs": [], + "source": [ + "from Bio import Entrez\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7d66e411-3c92-4855-a8a7-877b473edf20", + "metadata": {}, + "outputs": [], + "source": [ + "# Set up NCBI creds\n", + "\n", + "Entrez.email = \"\" # Email associated with your NCBI account\n", + "Entrez.api_key = \"\" # API key can be copied from your NCBI account > NCBI Account Settings\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fed6fe3-b282-43bc-862d-bbbb677165a3", + "metadata": {}, + "outputs": [], + "source": [ + "def get_gse(pmid_accession_list):\n", + " no_gse = False\n", + " \n", + " for pmid in pmid_accession_list:\n", + " handle_gds_uid = Entrez.elink(dbfrom=\"pubmed\", id=pmid, db=\"gds\") #get GDS UID (which is different from GSE accession)\n", + " record_gds_uid = Entrez.read(handle_gds_uid)\n", + " handle_gds_uid.close()\n", + " if len(record_gds_uid[0][\"LinkSetDb\"]) == 0: #If there is no GEO datasets (GDS) with the PMID\n", + " no_gse = True\n", + " gse_accession = \"Not (yet) in GEO\"\n", + " elif len(record_gds_uid[0]['LinkSetDb'][0]['Link']) >= 1: #If PMID is associated with GEO Datasets (GDS)\n", + " gds_uids = []\n", + " list_gds_acc = record_gds_uid[0]['LinkSetDb'][0]['Link']\n", + " for gse_acc in list_gds_acc:\n", + " gds_uids.append(gse_acc['Id'])\n", + " for ids in gds_uids:\n", + " handle2_gse_acc = Entrez.esummary(db=\"gds\", id=ids) # convert GDS UID to GSE accession\n", + " record2_gse_acc = Entrez.read(handle2_gse_acc)\n", + " handle2_gse_acc.close()\n", + " gse_accession = record2_gse_acc[0]['Accession']\n", + " print(pmid,gse_accession)\n", + " time.sleep(1) \n", + " \n", + " if no_gse:\n", + " print(pmid, gse_accession)\n", + " time.sleep(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"e45d94eb-0e06-491c-b05c-39589e682fd8", + "metadata": {}, + "outputs": [], + "source": [ + "# File with a list of PMIDs - one on each line.\n", + "\n", + "input_file = ''\n", + "chunks = 5 #Splits the PMID list into specified no. of values per list\n", + "\n", + "with open(input_file) as file1:\n", + "\n", + " all_pmids = []\n", + " for pmid in file1:\n", + " pmid = pmid.rstrip()\n", + " all_pmids.append(str(pmid))\n", + "\n", + "for count in range(0,len(all_pmids),chunks):\n", + " pmid_accession_list = all_pmids[count:count+chunks]\n", + " get_gse(pmid_accession_list)\n", + " time.sleep(2)\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "257f63a4-f5e4-4d37-9c4b-68e17ff0634b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.12" + }, + "vscode": { + "interpreter": { + "hash": "20d91ac981d81ffaf62d1b59390659afd05fbcd41a0bec0e13249d20dc131a1e" + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index 45c3ee9..fb51975 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "dcicwrangling" -version = "3.0.2" +version = "3.0.3" description = "Scripts and Jupyter notebooks for 4DN wrangling" authors = ["4DN-DCIC Team "] license = "MIT"