Skip to content

Commit 2d7664c

Browse files
committed
Update scripts for version 2.0
1 parent 082060e commit 2d7664c

9 files changed

+3245
-73
lines changed

alzkb/data/alzkb_v2.rdf

Lines changed: 2463 additions & 0 deletions
Large diffs are not rendered by default.

alzkb/populate_edge_weights.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,215 @@
1+
import pandas as pd
2+
import os
3+
4+
5+
# Load the populated AlzKB export that the rest of this script annotates.
path = './data/alzkb_v2-populated.csv'
df = pd.read_csv(path)

# Concatenating with an empty frame appends the edge-property columns
# (all missing) without adding any rows; the sections below fill them in.
df = pd.concat([
    df,
    pd.DataFrame(columns=['sourceDB', 'unbiased', 'affinity_nM', 'p_fisher', 'z_score', 'correlation', 'score', 'confidence']),
])
8+
9+
# hetionet-custom-edges.tsv
data_dir = "./AlzKB_Raw_Data"
hetionet_custom = pd.read_table(os.path.join(data_dir, 'hetionet/hetionet-custom-edges.tsv'))

# Metaedge abbreviation (as found in the 'metaedge' column) -> value written
# to the '_type' column used as a merge key against `df`.
hetio_custom = {
    'CbG': 'CHEMICALBINDSGENE',
    'DrD': 'DISEASEASSOCIATESWITHDISEASE',  # no results
    'DlA': 'DISEASELOCALIZESTOANATOMY',
    'DpS': 'SYMPTOMMANIFESTATIONOFDISEASE',
}

# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `hetionet_custom` (avoids pandas' chained-assignment
# warning and its undefined view-vs-copy behaviour).
affinity_nM = hetionet_custom[hetionet_custom['metaedge'] == 'CbG'].copy()

# The identifier is whatever follows the last '::' in the source/target value.
affinity_nM['xrefDrugbank'] = affinity_nM['source'].str.split('::').str[-1]
affinity_nM['xrefNcbiGene'] = affinity_nM['target'].str.split('::').str[-1].astype(int)

# Translate the external identifiers into the `_id` values used by `df`.
affinity_nM = affinity_nM.merge(df[['_id', 'xrefDrugbank']].rename(columns={'_id': '_start'}), on='xrefDrugbank', how='left')
affinity_nM = affinity_nM.merge(df[['_id', 'xrefNcbiGene']].rename(columns={'_id': '_end'}), on='xrefNcbiGene', how='left')
affinity_nM['_type'] = hetio_custom['CbG']

# Columns present on both sides come back from the merge with a '_new'
# suffix; prefer the new value and fall back to what `df` already holds.
# NOTE(review): the assignment aligns on index, so it assumes the left merge
# returns exactly one row per row of `df` (i.e. the (_start, _end, _type)
# keys are unique in `affinity_nM`) - confirm against the data.
merged_df = df.merge(affinity_nM, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'unbiased', 'affinity_nM']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
31+
32+
33+
# Disease identifier mappings, restricted to rows whose vocabulary is 'DO'.
disgenet = pd.read_table('./AlzKB_Raw_Data/disgenet/CUSTOM/disease_mappings_alzheimer.tsv')
disgenet = disgenet[disgenet['vocabulary'] == 'DO']

# --- DlA edges -------------------------------------------------------------
# .copy() so the new columns are written to an independent frame rather than
# to a slice of `hetionet_custom`.
p_fisher_DlA = hetionet_custom[hetionet_custom['metaedge'] == 'DlA'].copy()

# do_id: text after the last '::' of the source, then after its last ':'.
p_fisher_DlA['do_id'] = p_fisher_DlA['source'].str.split('::').str[-1].str.split(':').str[-1]
p_fisher_DlA['xrefUberon'] = p_fisher_DlA['target'].str.split('::').str[-1]

# Default (inner) merge: edges without a matching mapping row are dropped.
p_fisher_DlA = p_fisher_DlA.merge(disgenet, left_on='do_id', right_on='code')
p_fisher_DlA['_start'] = 'disease_' + p_fisher_DlA['diseaseId'].str.lower()
p_fisher_DlA = p_fisher_DlA.merge(df[['_id', 'xrefUberon']].rename(columns={'_id': '_end'}), on='xrefUberon', how='left')
p_fisher_DlA['_type'] = hetio_custom['DlA']

# --- DpS edges -------------------------------------------------------------
p_fisher_DpS = hetionet_custom[hetionet_custom['metaedge'] == 'DpS'].copy()

p_fisher_DpS['xrefMeSH'] = p_fisher_DpS['target'].str.split('::').str[-1]
p_fisher_DpS['do_id'] = p_fisher_DpS['source'].str.split('::').str[-1].str.split(':').str[-1]

# Direction is flipped relative to the input row: '_start' is derived from
# the target column and '_end' from the source column.
p_fisher_DpS = p_fisher_DpS.merge(df[['_id', 'xrefMeSH']].rename(columns={'_id': '_start'}), on='xrefMeSH', how='left')
p_fisher_DpS = p_fisher_DpS.merge(disgenet, left_on='do_id', right_on='code')
p_fisher_DpS['_end'] = 'disease_' + p_fisher_DpS['diseaseId'].str.lower()
p_fisher_DpS['_type'] = hetio_custom['DpS']

p_fisher = pd.concat([p_fisher_DlA, p_fisher_DpS])

# Prefer the freshly merged '_new' value, falling back to the value already
# in `df`.
# NOTE(review): index-aligned assignment; assumes the left merge keeps one
# row per row of `df` - confirm the merge keys are unique in `p_fisher`.
merged_df = df.merge(p_fisher, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'unbiased', 'p_fisher']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
63+
64+
65+
# hetionet-v1.0-edges.sif
66+
#https://github.com/dhimmel/integrate/blob/master/integrate.ipynb
67+
68+
import hetio.hetnet
69+
import hetio.readwrite
70+
import hetio.stats
71+
72+
# Read the hetnet JSON (bz2-compressed) straight from the URL below.
# Note: this rebinds `path`, which earlier held the input CSV location.
path = 'https://raw.githubusercontent.com/dhimmel/integrate/master/data/hetnet.json.bz2'
graph = hetio.readwrite.read_graph(path, formatting=None)
74+
75+
76+
#https://github.com/hetio/hetnetpy/blob/main/hetnetpy/readwrite.py
77+
import collections
78+
import operator
79+
import pandas as pd
80+
81+
def write_nodetable(graph):
    """Return a tabular encoding of the graph nodes.

    Args:
        graph: object exposing ``node_dict``, a mapping whose values are
            nodes with ``metanode.identifier``, ``name`` and a dict-like
            ``data`` attribute; ``str(node)`` is used as the node id.

    Returns:
        A ``pandas.DataFrame`` with columns ``id``, ``name``, ``kind`` and
        ``source``, one row per node, ordered by ``(kind, id)``.  ``source``
        is ``None`` for nodes whose ``data`` has no ``'source'`` entry.

    The shape of the resulting frame is printed before it is returned.
    """
    rows = [
        {
            "kind": node.metanode.identifier,
            "id": str(node),
            "name": node.name,
            # .get(): a node without a recorded source yields None instead of
            # raising KeyError, matching how write_edgetable treats edges.
            "source": node.data.get("source"),
        }
        for node in graph.node_dict.values()
    ]
    rows.sort(key=operator.itemgetter("kind", "id"))
    fieldnames = ["id", "name", "kind", "source"]
    df_nodes_tsv = pd.DataFrame(rows, columns=fieldnames)
    print(df_nodes_tsv.shape)
    return df_nodes_tsv
96+
97+
98+
def write_edgetable(graph):
    """Return a tabular encoding of the graph edges.

    Args:
        graph: object exposing ``get_metaedge_to_edges(exclude_inverts=True)``
            which returns a mapping of metaedge -> iterable of edges.  Each
            edge provides ``source``, ``target``, ``metaedge.abbrev`` and a
            dict-like ``data`` attribute.

    Returns:
        A ``pandas.DataFrame`` with columns ``source``, ``metaedge``,
        ``target`` followed by the edge properties ``sourceDB``,
        ``unbiased``, ``affinity_nM``, ``z_score``, ``p_fisher`` and
        ``correlation``.  A property absent from ``edge.data`` is ``None``.

    The shape of the resulting frame is printed before it is returned.
    """
    edge_properties = ["sourceDB", "unbiased", "affinity_nM", "z_score", "p_fisher", "correlation"]
    fieldnames = ["source", "metaedge", "target"] + edge_properties

    rows = []
    metaedge_to_edges = graph.get_metaedge_to_edges(exclude_inverts=True)
    for edges in metaedge_to_edges.values():
        for edge in edges:
            row = {
                "source": edge.source,
                "metaedge": edge.metaedge.abbrev,
                "target": edge.target,
            }
            for prop in edge_properties:
                # edge.data keeps this value under 'source'; it is exported
                # as 'sourceDB' because the 'source' column already holds the
                # edge's source node.
                data_key = "source" if prop == "sourceDB" else prop
                row[prop] = edge.data.get(data_key)
            rows.append(row)

    df_edges_tsv = pd.DataFrame(rows, columns=fieldnames)
    print(df_edges_tsv.shape)
    return df_edges_tsv
126+
127+
# Flatten the graph into an edge table and turn the source/target node
# objects into plain strings so string operations can be applied to them.
hetionet = write_edgetable(graph)
hetionet['source'] = hetionet['source'].astype(str)
hetionet['target'] = hetionet['target'].astype(str)

# Metaedge abbreviation (as found in the 'metaedge' column) -> value written
# to the '_type' column used as a merge key against `df`.
hetio = {
    'CuG': 'CHEMICALINCREASESEXPRESSION',
    'CdG': 'CHEMICALDECREASESEXPRESSION',
    'GcG': 'GENECOVARIESWITHGENE',
    'Gr>G': 'GENEREGULATESGENE',
}
138+
139+
140+
# CuG and CdG edges are processed identically; only the metaedge filter and
# the resulting '_type' differ, so handle both in one loop.
z_score_frames = []
for metaedge in ('CuG', 'CdG'):
    # .copy() so the new columns go to an independent frame instead of a
    # slice of `hetionet`.
    z_score = hetionet[hetionet['metaedge'] == metaedge].copy()

    # The identifier is whatever follows the last '::' in source/target.
    z_score['xrefDrugbank'] = z_score['source'].str.split('::').str[-1]
    z_score['xrefNcbiGene'] = z_score['target'].str.split('::').str[-1].astype(int)

    # Translate the external identifiers into the `_id` values used by `df`.
    z_score = z_score.merge(df[['_id', 'xrefDrugbank']].rename(columns={'_id': '_start'}), on='xrefDrugbank', how='left')
    z_score = z_score.merge(df[['_id', 'xrefNcbiGene']].rename(columns={'_id': '_end'}), on='xrefNcbiGene', how='left')
    z_score['_type'] = hetio[metaedge]

    z_score_frames.append(z_score)

z_score_all = pd.concat(z_score_frames)

# Prefer the freshly merged '_new' value, falling back to the value already
# in `df`.
# NOTE(review): index-aligned assignment; assumes the left merge keeps one
# row per row of `df` - confirm the merge keys are unique in `z_score_all`.
merged_df = df.merge(z_score_all, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'unbiased', 'z_score']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
164+
165+
166+
# Gene-gene correlation values, keyed by the 'source_entrez' and
# 'target_entrez' columns of the input table.
correlation = pd.read_table(os.path.join(data_dir, 'hetionet/geneCovariesWithGene_correlation.tsv'))

# Translate both gene identifiers into the `_id` values used by `df`.
correlation = correlation.merge(df[['_id', 'xrefNcbiGene']].rename(columns={'_id': '_start'}), left_on='source_entrez', right_on='xrefNcbiGene', how='left')
correlation = correlation.merge(df[['_id', 'xrefNcbiGene']].rename(columns={'_id': '_end'}), left_on='target_entrez', right_on='xrefNcbiGene', how='left')
correlation['_type'] = hetio['GcG']

# These two properties are set to constants for every row of this table.
correlation['sourceDB'] = 'Hetionet - ERC'
correlation['unbiased'] = True

# Prefer the freshly merged '_new' value, falling back to the value already
# in `df`.
# NOTE(review): index-aligned assignment; assumes the left merge keeps one
# row per row of `df` - confirm the merge keys are unique in `correlation`.
merged_df = df.merge(correlation, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'unbiased', 'correlation']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
179+
180+
181+
# DisGeNET
score = pd.read_table('./AlzKB_Raw_Data/disgenet/curated_gene_disease_associations.tsv')
# Prefix the table's own 'source' value to form the sourceDB label.
score['sourceDB'] = 'DisGeNET - ' + score['source']

# '_start' is the `_id` of the row in `df` with the matching xrefNcbiGene;
# '_end' is built directly from the lower-cased diseaseId.
score = score.merge(df[['_id', 'xrefNcbiGene']].rename(columns={'_id': '_start'}), left_on='geneId', right_on='xrefNcbiGene', how='left')
score['_end'] = 'disease_' + score['diseaseId'].str.lower()
score['_type'] = 'GENEASSOCIATESWITHDISEASE'

# Prefer the freshly merged '_new' value, falling back to the value already
# in `df`.
# NOTE(review): index-aligned assignment; assumes the left merge keeps one
# row per row of `df` - confirm the merge keys are unique in `score`.
merged_df = df.merge(score, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'score']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
193+
194+
195+
# TF
# The table is read once; the original read it twice with identical
# arguments and discarded the first result.
confidence = pd.read_table('./AlzKB_Raw_Data/dorothea/tf.tsv')

# '_start' is the `_id` of the row in `df` with the matching TF value;
# '_end' is the `_id` of the row whose geneSymbol equals the 'Gene' column.
confidence = confidence.merge(df[['_id', 'TF']].rename(columns={'_id': '_start'}), on='TF', how='left')
confidence = confidence.merge(df[['_id', 'geneSymbol']].rename(columns={'_id': '_end'}), left_on='Gene', right_on='geneSymbol', how='left')

confidence['_type'] = 'TRANSCRIPTIONFACTORINTERACTSWITHGENE'

# Prefer the freshly merged '_new' value, falling back to the value already
# in `df`.
# NOTE(review): index-aligned assignment; assumes the left merge keeps one
# row per row of `df` - confirm the merge keys are unique in `confidence`.
merged_df = df.merge(confidence, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'confidence']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
210+
211+
# Save the annotated table.  `index` is left at its default, so the frame's
# index is written as the first (unnamed) column of the CSV.
df.to_csv('./data/alzkb_v2.0.0_with_edge_properties.csv')
213+
214+
215+

0 commit comments

Comments
 (0)