Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,13 @@ Change Log
------


3.0.3
=====


* Added new wrangling notebook 16_check_publication_in_GEO.ipynb to check for associated GEO datasets for a given list of PMIDs.


3.0.2
=====

Expand Down
137 changes: 137 additions & 0 deletions notebooks/useful_notebooks/16_check_publication_in_GEO.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "d4da4f53-c565-4f8f-8c06-fe4884db3a52",
"metadata": {},
"source": [
"### PLEASE COPY NOTEBOOKS TO YOUR FOLDERS TO PREVENT COMMIT CONFLICTS\n",
"\n",
"#### For a given PMID, provides associated GEO dataset ID(s) (GSEXXX)\n",
"\n",
"#### Protocol:\n",
"\n",
"#### Register for an NCBI account and copy the API key shown under NCBI account > NCBI Account Settings\n",
"#### Provide a file with a list of PMIDs, one per line: numbers only, no prefix.\n",
"\n",
"#### NOTE:\n",
"#### Requires the Biopython module\n",
"#### Entrez allows a maximum of 10 requests per second if an API key is provided; otherwise the limit is 3 requests per second."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62ec2562-0707-4d78-9f59-bedb6c1812cc",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import time\n",
"\n",
"from Bio import Entrez"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7d66e411-3c92-4855-a8a7-877b473edf20",
"metadata": {},
"outputs": [],
"source": [
"# Set up NCBI creds\n",
"# Values are taken from the NCBI_EMAIL / NCBI_API_KEY environment variables when set,\n",
"# so the API key does not have to be saved (and committed) in the notebook.\n",
"# If the variables are not set, fill in the empty fallback strings below.\n",
"\n",
"Entrez.email = os.environ.get(\"NCBI_EMAIL\", \"\") # Email associated with your NCBI account\n",
"Entrez.api_key = os.environ.get(\"NCBI_API_KEY\", \"\") # API key can be copied from your NCBI account > NCBI Account Settings\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3fed6fe3-b282-43bc-862d-bbbb677165a3",
"metadata": {},
"outputs": [],
"source": [
"def get_gse(pmid_accession_list):\n",
"    \"\"\"Print the GEO accession(s) linked to each PMID.\n",
"\n",
"    For every PMID, Entrez elink (pubmed -> gds) is queried for linked GDS UIDs and\n",
"    each UID is resolved with esummary. One line \"<pmid> <accession>\" is printed per\n",
"    linked record, or \"<pmid> Not (yet) in GEO\" when no link is returned.\n",
"\n",
"    Args:\n",
"        pmid_accession_list: iterable of PMIDs (numbers only, no prefix).\n",
"\n",
"    Returns:\n",
"        None. Results are printed.\n",
"    \"\"\"\n",
"    for pmid in pmid_accession_list:\n",
"        # elink returns GDS UIDs, which are different from GSE accessions\n",
"        handle_gds_uid = Entrez.elink(dbfrom=\"pubmed\", id=pmid, db=\"gds\")\n",
"        try:\n",
"            record_gds_uid = Entrez.read(handle_gds_uid)\n",
"        finally:\n",
"            handle_gds_uid.close()\n",
"\n",
"        link_set_db = record_gds_uid[0][\"LinkSetDb\"]\n",
"        gds_uids = [link[\"Id\"] for link in link_set_db[0][\"Link\"]] if link_set_db else []\n",
"\n",
"        if not gds_uids:\n",
"            # Decided per PMID, so a PMID without GEO data cannot affect later PMIDs\n",
"            print(pmid, \"Not (yet) in GEO\")\n",
"\n",
"        for gds_uid in gds_uids:\n",
"            # convert GDS UID to its accession\n",
"            handle_summary = Entrez.esummary(db=\"gds\", id=gds_uid)\n",
"            try:\n",
"                record_summary = Entrez.read(handle_summary)\n",
"            finally:\n",
"                handle_summary.close()\n",
"            print(pmid, record_summary[0][\"Accession\"])\n",
"            time.sleep(5)  # pause between esummary requests to stay under the Entrez rate limit\n",
"\n",
"        time.sleep(7)  # pause before querying the next PMID"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e45d94eb-0e06-491c-b05c-39589e682fd8",
"metadata": {},
"outputs": [],
"source": [
"# File with a list of PMIDs - one on each line.\n",
"\n",
"input_file = ''\n",
"chunks = 5 # Number of PMIDs passed to get_gse per batch\n",
"\n",
"with open(input_file) as pmid_file:\n",
"    # Strip whitespace and skip blank lines so empty IDs are never sent to Entrez\n",
"    all_pmids = [line.strip() for line in pmid_file if line.strip()]\n",
"\n",
"for start in range(0, len(all_pmids), chunks):\n",
"    get_gse(all_pmids[start:start + chunks])\n",
"    time.sleep(5)  # pause between batches"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "257f63a4-f5e4-4d37-9c4b-68e17ff0634b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "dcicwrangling"
version = "3.0.2"
version = "3.0.3"
description = "Scripts and Jupyter notebooks for 4DN wrangling"
authors = ["4DN-DCIC Team <[email protected]>"]
license = "MIT"
Expand Down