import pandas as pd
import os


path = './data/alzkb_v2-populated.csv'
df = pd.read_csv(path)

# Add the empty edge-property columns that the sources below will fill in.
edge_property_cols = ['sourceDB', 'unbiased', 'affinity_nM', 'p_fisher',
                      'z_score', 'correlation', 'score', 'confidence']
for col in edge_property_cols:
    if col not in df.columns:
        df[col] = None

# hetionet-custom-edges.tsv
data_dir = "./AlzKB_Raw_Data"
hetionet_custom = pd.read_table(os.path.join(data_dir, 'hetionet/hetionet-custom-edges.tsv'))

hetio_custom = {
    'CbG': 'CHEMICALBINDSGENE',
    'DrD': 'DISEASEASSOCIATESWITHDISEASE',  # no results
    'DlA': 'DISEASELOCALIZESTOANATOMY',
    'DpS': 'SYMPTOMMANIFESTATIONOFDISEASE',
}
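# NOTE: hetio_custom (and the hetio dict further down) maps Hetionet metaedge
# abbreviations to the relationship labels used as `_type` in the AlzKB edge table,
# so each per-source frame can be joined on ('_start', '_end', '_type').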


# affinity_nM for CHEMICALBINDSGENE edges.
affinity_nM = hetionet_custom[hetionet_custom['metaedge'] == 'CbG'].copy()  # .copy() avoids SettingWithCopyWarning
affinity_nM['xrefDrugbank'] = affinity_nM['source'].str.split('::').str[-1]
affinity_nM['xrefNcbiGene'] = affinity_nM['target'].str.split('::').str[-1].astype(int)
affinity_nM = affinity_nM.merge(df[['_id', 'xrefDrugbank']].rename(columns={'_id': '_start'}), on='xrefDrugbank', how='left')
affinity_nM = affinity_nM.merge(df[['_id', 'xrefNcbiGene']].rename(columns={'_id': '_end'}), on='xrefNcbiGene', how='left')
affinity_nM['_type'] = hetio_custom['CbG']

merged_df = df.merge(affinity_nM, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'unbiased', 'affinity_nM']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
df.shape
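
# The merge-then-combine_first pattern above is repeated for every property source
# below. A helper along these lines could factor it out (hypothetical sketch only;
# the steps below keep the explicit per-source code):
def fill_edge_properties(df, edges, columns):
    """Left-join `edges` on ('_start', '_end', '_type') and fill `columns` into df."""
    merged = df.merge(edges, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
    for column in columns:
        df[column] = merged[column + '_new'].combine_first(df[column])
    return df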


disgenet = pd.read_table('./AlzKB_Raw_Data/disgenet/CUSTOM/disease_mappings_alzheimer.tsv')
disgenet = disgenet[disgenet['vocabulary'] == 'DO']
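# NOTE: this mapping file links Disease Ontology codes to DisGeNET disease identifiers
# (UMLS CUIs); it is used below to translate Hetionet DOID-based disease edges into
# AlzKB disease node ids of the form 'disease_<diseaseId lowercased>'.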


# p_fisher for DISEASELOCALIZESTOANATOMY edges.
p_fisher_DlA = hetionet_custom[hetionet_custom['metaedge'] == 'DlA'].copy()

p_fisher_DlA['do_id'] = p_fisher_DlA['source'].str.split('::').str[-1].str.split(':').str[-1]
p_fisher_DlA['xrefUberon'] = p_fisher_DlA['target'].str.split('::').str[-1]

p_fisher_DlA = p_fisher_DlA.merge(disgenet, left_on='do_id', right_on='code')
p_fisher_DlA['_start'] = 'disease_' + p_fisher_DlA['diseaseId'].str.lower()
p_fisher_DlA = p_fisher_DlA.merge(df[['_id', 'xrefUberon']].rename(columns={'_id': '_end'}), on='xrefUberon', how='left')
p_fisher_DlA['_type'] = hetio_custom['DlA']

# p_fisher for SYMPTOMMANIFESTATIONOFDISEASE edges.
p_fisher_DpS = hetionet_custom[hetionet_custom['metaedge'] == 'DpS'].copy()

p_fisher_DpS['xrefMeSH'] = p_fisher_DpS['target'].str.split('::').str[-1]
p_fisher_DpS['do_id'] = p_fisher_DpS['source'].str.split('::').str[-1].str.split(':').str[-1]

p_fisher_DpS = p_fisher_DpS.merge(df[['_id', 'xrefMeSH']].rename(columns={'_id': '_start'}), on='xrefMeSH', how='left')
p_fisher_DpS = p_fisher_DpS.merge(disgenet, left_on='do_id', right_on='code')
p_fisher_DpS['_end'] = 'disease_' + p_fisher_DpS['diseaseId'].str.lower()
p_fisher_DpS['_type'] = hetio_custom['DpS']

p_fisher = pd.concat([p_fisher_DlA, p_fisher_DpS])

merged_df = df.merge(p_fisher, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'unbiased', 'p_fisher']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
df.shape


# hetionet-v1.0-edges.sif
# https://github.com/dhimmel/integrate/blob/master/integrate.ipynb

import hetio.hetnet
import hetio.readwrite
import hetio.stats

path = 'https://raw.githubusercontent.com/dhimmel/integrate/master/data/hetnet.json.bz2'
graph = hetio.readwrite.read_graph(path, formatting=None)
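# The full Hetionet v1.0 hetnet is loaded from the compressed JSON release and then
# flattened into a tabular edge list (write_edgetable below) so that per-edge
# properties such as z_score can be joined onto the AlzKB edge table.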


# Adapted from https://github.com/hetio/hetnetpy/blob/main/hetnetpy/readwrite.py
import collections
import operator


def write_nodetable(graph):
    """Return a tabular encoding of the graph nodes as a DataFrame."""
    rows = list()
    for node in graph.node_dict.values():
        row = collections.OrderedDict()
        row["kind"] = node.metanode.identifier
        row["id"] = str(node)
        row["name"] = node.name
        row["source"] = node.data['source']
        rows.append(row)
    rows.sort(key=operator.itemgetter("kind", "id"))
    fieldnames = ["id", "name", "kind", "source"]
    df_nodes_tsv = pd.DataFrame(rows, columns=fieldnames)
    print(df_nodes_tsv.shape)
    return df_nodes_tsv


def write_edgetable(graph):
    """Return a tabular encoding of the graph edges as a DataFrame."""
    rows = list()
    edge_properties = ["sourceDB", "unbiased", "affinity_nM", "z_score", "p_fisher", "correlation"]
    fieldnames = ["source", "metaedge", "target"] + edge_properties
    metaedge_to_edges = graph.get_metaedge_to_edges(exclude_inverts=True)
    for metaedge, edges in metaedge_to_edges.items():
        for edge in edges:
            row = collections.OrderedDict()
            row["source"] = edge.source
            row["metaedge"] = edge.metaedge.abbrev
            row["target"] = edge.target
            for prop in edge_properties:
                # Hetionet stores provenance under the 'source' key; expose it as 'sourceDB'.
                data_key = 'source' if prop == 'sourceDB' else prop
                row[prop] = edge.data.get(data_key)
            rows.append(row)
    df_edges_tsv = pd.DataFrame(rows, columns=fieldnames)
    print(df_edges_tsv.shape)
    return df_edges_tsv


hetionet = write_edgetable(graph)
hetionet['source'] = hetionet['source'].astype(str)
hetionet['target'] = hetionet['target'].astype(str)
hetionet

hetio = {
    'CuG': 'CHEMICALINCREASESEXPRESSION',
    'CdG': 'CHEMICALDECREASESEXPRESSION',
    'GcG': 'GENECOVARIESWITHGENE',
    'Gr>G': 'GENEREGULATESGENE',
}

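# z_score for chemical-gene expression edges (CuG: increases expression,
# CdG: decreases expression), pulled from the Hetionet edge table built above.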
z_score_frames = []
for metaedge in ['CuG', 'CdG']:
    z_score = hetionet[hetionet['metaedge'] == metaedge].copy()
    z_score['xrefDrugbank'] = z_score['source'].str.split('::').str[-1]
    z_score['xrefNcbiGene'] = z_score['target'].str.split('::').str[-1].astype(int)
    z_score = z_score.merge(df[['_id', 'xrefDrugbank']].rename(columns={'_id': '_start'}), on='xrefDrugbank', how='left')
    z_score = z_score.merge(df[['_id', 'xrefNcbiGene']].rename(columns={'_id': '_end'}), on='xrefNcbiGene', how='left')
    z_score['_type'] = hetio[metaedge]
    z_score_frames.append(z_score)
z_score_all = pd.concat(z_score_frames)

merged_df = df.merge(z_score_all, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'unbiased', 'z_score']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
df.shape

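# correlation for GENECOVARIESWITHGENE edges (Hetionet ERC data), keyed on Entrez gene ids.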
correlation = pd.read_table(os.path.join(data_dir, 'hetionet/geneCovariesWithGene_correlation.tsv'))

correlation = correlation.merge(df[['_id', 'xrefNcbiGene']].rename(columns={'_id': '_start'}), left_on='source_entrez', right_on='xrefNcbiGene', how='left')
correlation = correlation.merge(df[['_id', 'xrefNcbiGene']].rename(columns={'_id': '_end'}), left_on='target_entrez', right_on='xrefNcbiGene', how='left')
correlation['_type'] = hetio['GcG']
correlation['sourceDB'] = 'Hetionet - ERC'
correlation['unbiased'] = True

merged_df = df.merge(correlation, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'unbiased', 'correlation']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
df.shape
df.loc[~df['correlation'].isna()]


# DisGeNET curated gene-disease association scores.
score = pd.read_table('./AlzKB_Raw_Data/disgenet/curated_gene_disease_associations.tsv')
score['sourceDB'] = 'DisGeNET - ' + score['source']

score = score.merge(df[['_id', 'xrefNcbiGene']].rename(columns={'_id': '_start'}), left_on='geneId', right_on='xrefNcbiGene', how='left')
score['_end'] = 'disease_' + score['diseaseId'].str.lower()
score['_type'] = 'GENEASSOCIATESWITHDISEASE'

merged_df = df.merge(score, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'score']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
df.shape


# Transcription factor-gene interactions (DoRothEA) with confidence levels.
confidence = pd.read_table('./AlzKB_Raw_Data/dorothea/tf.tsv')
confidence

confidence = confidence.merge(df[['_id', 'TF']].rename(columns={'_id': '_start'}), on='TF', how='left')
confidence = confidence.merge(df[['_id', 'geneSymbol']].rename(columns={'_id': '_end'}), left_on='Gene', right_on='geneSymbol', how='left')

confidence['_type'] = 'TRANSCRIPTIONFACTORINTERACTSWITHGENE'

merged_df = df.merge(confidence, on=['_start', '_end', '_type'], suffixes=('', '_new'), how='left')
for column in ['sourceDB', 'confidence']:
    df[column] = merged_df[column + '_new'].combine_first(df[column])
df.shape

# Save data file.
df.to_csv('./data/alzkb_v2.0.0_with_edge_properties.csv')