::p_load(tidyverse, readxl, writexl, janitor, CellChat, ggvenn, biomaRt,
pacman
conflicted, tidytext, DT, kableExtra)setwd(this.path::here())
rm(list = ls())
conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")
conflict_prefer("setdiff", "base")
# removes NA from all kable
options(knitr.kable.NA = "")
Finding Consensus with CellPhoneDBv5
This document is for outlining the mapping process of interactions generated from cell-cell communication (CCC) analysis - using both CellPhoneDBv5 and CellChatv2.
CellChat is used as the reference CCC tool, and we want to find out which interactions in CellChat are supported by CellPhoneDB. To ensure high confidence and unbiased mapping, we convert all gene symbols to their respective Uniprot IDs in both CCC tools, and do the same for the subunits of complexes.
1. Setup
1.1. R setup
1.2. Folder setup
sapply( c("data", "results"), dir.create, recursive = TRUE )
1.3. Download CellPhoneDB data files
These files were downloaded on 11th July 2025.
<- "https://raw.github.com/ventolab/cellphonedb-data/refs/heads/master"
basepath
# Main database containing the interaction
download.file( file.path(basepath, "cellphonedb.zip"), "data/cellphonedb.zip" )
unzip("data/cellphonedb.zip", exdir = "data", junkpaths = TRUE)
# Additional files for mapping
download.file( file.path(basepath, "data/complex_input.csv"), "data/complex_input.csv" )
download.file( file.path(basepath, "data/protein_input.csv"), "data/protein_input.csv" )
download.file( file.path(basepath, "data/sources/uniprot_synonyms.tsv"), "data/uniprot_synonyms.tsv" )
2. Complex Mapping
2.1. Mapping CellPhoneDB complexes to Uniprot IDs
<- read_csv("data/complex_input.csv") %>%
CPDB.complex2protein select(complex_name, starts_with("uniprot"))
<- c("12oxoLeukotrieneB4_byPTGR1", "TREM2_receptor", "RAreceptor_RARA_RXRA",
example "Pregnenolone_byCYP11A1", "5HTR3_complex")
%>%
CPDB.complex2protein filter(complex_name %in% example) %>%
mutate(complex_name = factor(complex_name, levels = example)) %>%
arrange(complex_name) %>%
::kable() knitr
complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 |
---|---|---|---|---|---|
12oxoLeukotrieneB4_byPTGR1 | Q14914 | ||||
TREM2_receptor | Q9NZC2 | O43914 | |||
RAreceptor_RARA_RXRA | P10276 | P19793 | P29373 | ||
Pregnenolone_byCYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | |
5HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 |
%>%
CPDB.complex2protein mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
tabyl(nproteins_per_complex) %>%
::kable(align = rep("l", nrow(.))) knitr
nproteins_per_complex | n | percent |
---|---|---|
1 | 61 | 0.1685083 |
2 | 261 | 0.7209945 |
3 | 36 | 0.0994475 |
4 | 3 | 0.0082873 |
5 | 1 | 0.0027624 |
There are 362 complexes and each complex can include up to 5 proteins (but typically 2).
2.2. Add a unique key for matching
To create the complex key, we concatenated the subunits of heteromeric complexes into a vector, removed NAs within, and sorted the subunits before collapsing them into a single character value. We found that sorting eases the mapping process computationally between CellPhoneDB and CellChat (Before sort: < 250 mapped; After sort: > 290 mapped).
<- CPDB.complex2protein %>%
CPDB.complex2protein rowwise() %>%
mutate(complex_key = paste(
sort(na.omit(c(uniprot_1, uniprot_2, uniprot_3, uniprot_4, uniprot_5))),
collapse = "_")) %>%
ungroup()
%>%
CPDB.complex2protein filter(complex_name %in% example) %>%
mutate(complex_name = factor(complex_name, levels = example)) %>%
arrange(complex_name) %>%
::kable() knitr
complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key |
---|---|---|---|---|---|---|
12oxoLeukotrieneB4_byPTGR1 | Q14914 | Q14914 | ||||
TREM2_receptor | Q9NZC2 | O43914 | O43914_Q9NZC2 | |||
RAreceptor_RARA_RXRA | P10276 | P19793 | P29373 | P10276_P19793_P29373 | ||
Pregnenolone_byCYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | P05108_P10109_P22570_Q6P4F2 | |
5HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 | A5X5Y0_O95264_P46098_Q70Z44_Q8WXA8 |
2.3. Mapping CellChatDB complexes to Uniprot IDs
CellChat version 2.2.0 stores the database within R under CellChatDB.human
.
::p_version("CellChat") pacman
[1] '2.2.0'
<- CellChatDB.human$complex %>%
CCDB.complex2gene mutate(complex_name = rownames(.)) %>%
rename_with( ~ gsub("subunit", "symbol", .)) %>%
relocate(complex_name) %>%
remove_rownames()
<- c("12oxoLTB4-PTGR1", "TREM2_TYROBP", "RARA_RXRA_CRABP2",
example "Pregnenolone-CYP11A1", "HTR3_complex")
%>% dim() CCDB.complex2gene
[1] 338 6
%>%
CCDB.complex2gene filter(complex_name %in% example) %>%
::kable() knitr
complex_name | symbol_1 | symbol_2 | symbol_3 | symbol_4 | symbol_5 |
---|---|---|---|---|---|
12oxoLTB4-PTGR1 | PTGR1 | ||||
TREM2_TYROBP | TREM2 | TYROBP | |||
RARA_RXRA_CRABP2 | RARA | RXRA | CRABP2 | ||
Pregnenolone-CYP11A1 | CYP11A1 | FDX2 | FDX1 | FDXR | |
HTR3_complex | HTR3A | HTR3B | HTR3C | HTR3D | HTR3E |
We need to map the gene symbols to uniprot IDs to make this database compatible with CellPhoneDB.
<- CellChatDB.human$geneInfo %>%
CCDB.protein2gene select(EntryID.uniprot, Symbol)
%>% dim() CCDB.protein2gene
[1] 26827 2
%>% head() CCDB.protein2gene
# A tibble: 6 × 2
EntryID.uniprot Symbol
<chr> <chr>
1 P04217 A1BG
2 Q9NQ94 A1CF
3 P01023 A2M
4 A8K2U0 A2ML1
5 U3KPV4 A3GALT2
6 Q9NPC4 A4GALT
## Map gene symbol to uniprot
<- CCDB.complex2gene %>%
CCDB.complex2protein pivot_longer(cols = !complex_name) %>%
left_join(CCDB.protein2gene, by = c("value" = "Symbol")) %>%
select(complex_name, name, EntryID.uniprot) %>%
pivot_wider(id_cols = complex_name, names_from = "name", values_from = "EntryID.uniprot") %>%
rename_with( ~ gsub("symbol", "uniprot", .))
%>%
CCDB.complex2protein filter(complex_name %in% example) %>%
::kable() knitr
complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 |
---|---|---|---|---|---|
12oxoLTB4-PTGR1 | Q14914 | ||||
TREM2_TYROBP | Q9NZC2 | O43914 | |||
RARA_RXRA_CRABP2 | P10276 | P19793 | P29373 | ||
Pregnenolone-CYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | |
HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 |
Again, we can count the number of proteins per complex:
%>%
CCDB.complex2protein mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
tabyl(nproteins_per_complex) %>%
::kable(align = rep("l", nrow(.))) knitr
nproteins_per_complex | n | percent |
---|---|---|
1 | 50 | 0.1479290 |
2 | 250 | 0.7396450 |
3 | 33 | 0.0976331 |
4 | 4 | 0.0118343 |
5 | 1 | 0.0029586 |
Now, let’s add a key to help with the comparison, using the same logic from CellPhoneDB.
<- CCDB.complex2protein %>%
CCDB.complex2protein rowwise() %>%
mutate(complex_key = paste(
sort(na.omit(c(uniprot_1, uniprot_2, uniprot_3, uniprot_4, uniprot_5))),
collapse = "_")) %>%
ungroup()
%>%
CCDB.complex2protein filter(complex_name %in% example) %>%
::kable() knitr
complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key |
---|---|---|---|---|---|---|
12oxoLTB4-PTGR1 | Q14914 | Q14914 | ||||
TREM2_TYROBP | Q9NZC2 | O43914 | O43914_Q9NZC2 | |||
RARA_RXRA_CRABP2 | P10276 | P19793 | P29373 | P10276_P19793_P29373 | ||
Pregnenolone-CYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | P05108_P10109_P22570_Q6P4F2 | |
HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 | A5X5Y0_O95264_P46098_Q70Z44_Q8WXA8 |
2.4. Duplicate keys
Before we merge, let us pull out the duplicate keys within each database for manual curation.
<- CPDB.complex2protein %>%
CPDB.complex2protein.uniq add_count(complex_key) %>%
filter(n == 1) %>%
select(-n)
<- CPDB.complex2protein %>%
CPDB.complex2protein.dup add_count(complex_key) %>%
filter(n > 1) %>%
select(-n) %>%
arrange(complex_key)
Duplicate complex keys in CellPhoneDB
%>%
CPDB.complex2protein.dup mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
arrange(nproteins_per_complex) %>%
select(-nproteins_per_complex) %>%
::kable() knitr
complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key |
---|---|---|---|---|---|---|
SerotoninDopamin_byDDC | P20711 | P20711 | ||||
b-PEA_byDDC | P20711 | P20711 | ||||
Desmosterol_byDHCR7 | Q9UBM7 | Q9UBM7 | ||||
Cholesterol_byDHCR7 | Q9UBM7 | Q9UBM7 | ||||
IL28_receptor | Q8IU57 | Q08334 | Q08334_Q8IU57 | |||
Type_III_IFNR | Q08334 | Q8IU57 | Q08334_Q8IU57 | |||
5SHpETE_byALOX5 | P09917 | Q16873 | P20292 | P09917_P20292_Q16873 | ||
LeukotrieneC4_byLTC4S | P20292 | P09917 | Q16873 | P09917_P20292_Q16873 | ||
LipoxinA4_byALOX5 | P09917 | P20292 | Q16873 | P09917_P20292_Q16873 |
<- CCDB.complex2protein %>%
CCDB.complex2protein.uniq add_count(complex_key) %>%
filter(n == 1) %>%
select(-n)
<- CCDB.complex2protein %>%
CCDB.complex2protein.dup add_count(complex_key) %>%
filter(n > 1) %>%
select(-n) %>%
arrange(complex_key)
Duplicate complex keys in CellChat
%>%
CCDB.complex2protein.dup mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
arrange(nproteins_per_complex) %>%
select(-nproteins_per_complex) %>%
::kable() knitr
complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key |
---|---|---|---|---|---|---|
Cholesterol-DHCR7 | Q9UBM7 | Q9UBM7 | ||||
Desmosterol-DHCR7 | Q9UBM7 | Q9UBM7 | ||||
CALCRL_RAMP1 | P30988 | O60894 | O60894_P30988 | |||
CALCR_RAMP1 | P30988 | O60894 | O60894_P30988 | |||
CALCRL_RAMP2 | P30988 | O60895 | O60895_P30988 | |||
CALCR_RAMP2 | P30988 | O60895 | O60895_P30988 | |||
CALCRL_RAMP3 | P30988 | O60896 | O60896_P30988 | |||
CALCR_RAMP3 | P30988 | O60896 | O60896_P30988 | |||
ITGA1_ITGB1 | P56199 | P05556 | P05556_P56199 | |||
ITGB1_ITGA1 | P05556 | P56199 | P05556_P56199 | |||
KLRD1_KLRC1 | Q13241 | P26715 | P26715_Q13241 | |||
CD94:NKG2A | Q13241 | P26715 | P26715_Q13241 | |||
KLRD1_KLRC2 | Q13241 | P26717 | P26717_Q13241 | |||
CD94:NKG2C | Q13241 | P26717 | P26717_Q13241 | |||
KLRK1_HCST | P26718 | Q9UBK5 | P26718_Q9UBK5 | |||
NKG2D_HCST | P26718 | Q9UBK5 | P26718_Q9UBK5 | |||
LTC4-LTC4S | P20292 | P09917 | Q16873 | P09917_P20292_Q16873 | ||
LXA4-ALOX5 | P09917 | P20292 | Q16873 | P09917_P20292_Q16873 |
Let’s merge the unique records
<- full_join(CCDB.complex2protein.uniq, CPDB.complex2protein.uniq,
combined_unique by = "complex_key",
suffix = c(".CC", ".CP")) %>%
relocate("complex_key")
combined_unique
Save the output as excel file for manual mapping
::write.xlsx(
openxlsxlist( partial = combined_unique,
CCDB_dup = CCDB.complex2protein.dup,
CPDB_dup = CPDB.complex2protein.dup),
file = "matched_partial_tmp.xlsx")
After manually mapping complexes to both databases and assigning them to a unique complex key, the mapped file contains six sheets listed as such:
excel_sheets("matched_partial.xlsx")
[1] "partial" "CCDB_dup" "CPDB_dup" "partial match"
[5] "mapped" "clean"
How duplicate keys were handled
CellPhoneDB
<- read_excel("matched_partial.xlsx", sheet = "CPDB_dup")
CPDB.dup
%>%
CPDB.dup mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
arrange(nproteins_per_complex) %>%
select(-nproteins_per_complex) %>%
::kable() knitr
complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key | proposed | match found |
---|---|---|---|---|---|---|---|---|
b-PEA_byDDC | P20711 | P20711 | no | |||||
SerotoninDopamin_byDDC | P20711 | P20711 | yes | |||||
Desmosterol_byDHCR7 | Q9UBM7 | Q9UBM7 | yes | |||||
Cholesterol_byDHCR7 | Q9UBM7 | Q9UBM7 | yes | |||||
IL28_receptor | Q8IU57 | Q08334 | Q08334_Q8IU57 | yes | ||||
Type_III_IFNR | Q08334 | Q8IU57 | Q08334_Q8IU57 | Q8IU57_Q08334 | yes | |||
5SHpETE_byALOX5 | P09917 | Q16873 | P20292 | P09917_P20292_Q16873 | P09917_Q16873_P20292 | no | ||
LeukotrieneC4_byLTC4S | P20292 | P09917 | Q16873 | P09917_P20292_Q16873 | P20292_P09917_Q16873 | yes | ||
LipoxinA4_byALOX5 | P09917 | P20292 | Q16873 | P09917_P20292_Q16873 | P09917_P20292_Q16873 | yes |
CellChat
<- read_excel("matched_partial.xlsx", sheet = "CCDB_dup")
CCDB.dup
%>%
CCDB.dup mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
arrange(nproteins_per_complex) %>%
select(-nproteins_per_complex) %>%
::kable() knitr
complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key | proposed | match found |
---|---|---|---|---|---|---|---|---|
Cholesterol-DHCR7 | Q9UBM7 | Q9UBM7 | yes | |||||
Desmosterol-DHCR7 | Q9UBM7 | Q9UBM7 | yes | |||||
KLRD1_KLRC1 | Q13241 | P26715 | P26715_Q13241 | Unused in CellChat | ||||
KLRD1_KLRC2 | Q13241 | P26717 | P26717_Q13241 | Unused in CellChat | ||||
KLRK1_HCST | P26718 | Q9UBK5 | P26718_Q9UBK5 | Unused in CellChat | ||||
CALCRL_RAMP1 | P30988 | O60894 | O60894_P30988 | O60894_Q16602 | yes | |||
CALCR_RAMP1 | P30988 | O60894 | O60894_P30988 | yes | ||||
CALCRL_RAMP2 | P30988 | O60895 | O60895_P30988 | O60895_Q16602 | yes | |||
CALCR_RAMP2 | P30988 | O60895 | O60895_P30988 | yes | ||||
CALCRL_RAMP3 | P30988 | O60896 | O60896_P30988 | O60896_Q16602 | yes | |||
CALCR_RAMP3 | P30988 | O60896 | O60896_P30988 | yes | ||||
ITGA1_ITGB1 | P56199 | P05556 | P05556_P56199 | yes | ||||
ITGB1_ITGA1 | P05556 | P56199 | P05556_P56199 | yes | ||||
CD94:NKG2A | Q13241 | P26715 | P26715_Q13241 | yes | ||||
CD94:NKG2C | Q13241 | P26717 | P26717_Q13241 | yes | ||||
NKG2D_HCST | P26718 | Q9UBK5 | P26718_Q9UBK5 | yes | ||||
LTC4-LTC4S | P20292 | P09917 | Q16873 | P09917_P20292_Q16873 | P20292_P09917_Q16873 | yes | ||
LXA4-ALOX5 | P09917 | P20292 | Q16873 | P09917_P20292_Q16873 | P09917_P20292_Q16873 | yes |
We include a proposed complex key to differentiate complexes composed of the same subunits based on:
(1) functional context the complex operates (e.g. same complex producing different non-protein compounds)
(2) false annotation of uniprot ID to its corresponding gene (e.g. P30988 (CALCR) replaced to Q16602 (CALCRL) for CALCRL-containing complexes).
Load the complete curated file
<- read_excel("matched_partial.xlsx", sheet = "clean")
complex
%>% dim() complex
[1] 394 10
Mapped complexes file with the above example
complexes for illustration
Each complex in CellChat and CellPhoneDB is now assigned a unique complex key
%>%
complex filter(CellChat_name %in% example) %>%
mutate(CellChat_name = factor(CellChat_name, levels = example)) %>%
arrange(CellChat_name) %>%
::kable() knitr
complex_key | CellChat_name | CellPhoneDB_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | match | Note |
---|---|---|---|---|---|---|---|---|---|
Q14914 | 12oxoLTB4-PTGR1 | 12oxoLeukotrieneB4_byPTGR1 | Q14914 | exact | |||||
O43914_Q9NZC2 | TREM2_TYROBP | TREM2_receptor | Q9NZC2 | O43914 | exact | ||||
P10276_P19793_P29373 | RARA_RXRA_CRABP2 | RAreceptor_RARA_RXRA | P10276 | P19793 | P29373 | exact | |||
P05108_P10109_P22570_Q6P4F2 | Pregnenolone-CYP11A1 | Pregnenolone_byCYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | exact | ||
A5X5Y0_O95264_P46098_Q70Z44_Q8WXA8 | HTR3_complex | 5HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 | exact |
Breakdown of complexes in the final list
%>%
complex tabyl(match) %>%
arrange(desc(n))
match n percent
exact 296 0.75126904
Found only in CellPhoneDB 55 0.13959391
Found only in CellChat 31 0.07868020
manual 8 0.02030457
partial match 4 0.01015228
Following manual curation of the complexes and mapping based on uniprot IDs, the break down of the matching process is as shown above. We display the manually-mapped complexes, and partial match
complexes which we define as sharing at least two subunits.
%>%
complex filter(match %in% c("manual", "partial match")) %>%
mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
arrange(nproteins_per_complex) %>%
select(-nproteins_per_complex)
Click on the rows to view information in the Note column.
Full complex mapping table
complex
3. Gene Mapping
We map genes in both databases to their respective uniprot IDs for consistency.
3.1. Mapping CellPhoneDB genes to Uniprot IDs
Preprocessing data
rm(list = ls())
options(knitr.kable.NA = NA) # allow NA in kable from this section
<- read_delim("data/uniprot_synonyms.tsv") %>%
CPDB.name2uniprot clean_names() %>%
select(entry_name, entry)
%>% dim() CPDB.name2uniprot
[1] 207304 2
%>%
CPDB.name2uniprot head()
# A tibble: 6 × 2
entry_name entry
<chr> <chr>
1 A0A024QZ08_HUMAN A0A024QZ08
2 A0A024QZ26_HUMAN A0A024QZ26
3 A0A024QZ86_HUMAN A0A024QZ86
4 A0A024QZA8_HUMAN A0A024QZA8
5 A0A024QZB8_HUMAN A0A024QZB8
6 A0A024QZQ1_HUMAN A0A024QZQ1
<- read_csv("data/gene_table.csv") %>%
CPDB.gene2name arrange(desc(ensembl)) %>%
distinct(protein_id, .keep_all = T)
%>% dim() CPDB.gene2name
[1] 1355 6
%>%
CPDB.gene2name head() %>%
::kable(align = rep("l", nrow(.))) knitr
id_gene | ensembl | gene_name | hgnc_symbol | protein_name | protein_id |
---|---|---|---|---|---|
472 | ENSG00000286070 | GGT1 | GGT1 | GGT1_HUMAN | 396 |
989 | ENSG00000285456 | LTB4R | LTB4R | LT4R1_HUMAN | 782 |
1240 | ENSG00000285203 | LTB4R2 | LTB4R2 | LT4R2_HUMAN | 984 |
1071 | ENSG00000285070 | ADIPOR2 | ADIPOR2 | ADR2_HUMAN | 843 |
500 | ENSG00000284816 | EPHA1 | EPHA1 | EPHA1_HUMAN | 419 |
715 | ENSG00000284510 | KIR2DL3 | KIR2DL3 | KI2L3_HUMAN | 592 |
We want to map protein_name in CPDB.gene2name
to entry (uniprot ID) using CPDB.name2uniprot
.
Addressing invalid data points (i.e. non-uniprot protein name, missing entries)
Some proteins did not have a valid uniprot name (no “_HUMAN” in the protein name). Therefore, we manually added the uniprot name with reference from https://www.uniprot.org/.
$protein_name[!grepl("HUMAN$", CPDB.gene2name$protein_name)] CPDB.gene2name
[1] "CCL3L1" "HLAC" "HLAA" "HLAB" "IFNA1"
# fill in missing entry names
<- CPDB.gene2name %>%
CPDB.gene2name mutate(protein_name = case_when(
%in% c("HLAA", "HLAB", "HLAC") ~ paste0(protein_name, "_HUMAN"),
protein_name == "CCL3L1" ~ "CL3L1_HUMAN",
protein_name == "IFNA1" ~ "L0N195_HUMAN",
protein_name .default = protein_name
%>%
))
# mapping from uniprot protein name to the corresponding uniprot IDs
left_join(CPDB.name2uniprot %>%
select(entry, entry_name), by = c("protein_name" = "entry_name"))
%>% dim() CPDB.gene2name
[1] 1355 7
%>%
CPDB.gene2name head() %>%
::kable(align = rep("l", nrow(.))) knitr
id_gene | ensembl | gene_name | hgnc_symbol | protein_name | protein_id | entry |
---|---|---|---|---|---|---|
472 | ENSG00000286070 | GGT1 | GGT1 | GGT1_HUMAN | 396 | P19440 |
989 | ENSG00000285456 | LTB4R | LTB4R | LT4R1_HUMAN | 782 | Q15722 |
1240 | ENSG00000285203 | LTB4R2 | LTB4R2 | LT4R2_HUMAN | 984 | Q9NPC1 |
1071 | ENSG00000285070 | ADIPOR2 | ADIPOR2 | ADR2_HUMAN | 843 | NA |
500 | ENSG00000284816 | EPHA1 | EPHA1 | EPHA1_HUMAN | 419 | P21709 |
715 | ENSG00000284510 | KIR2DL3 | KIR2DL3 | KI2L3_HUMAN | 592 | P43628 |
rm(CPDB.name2uniprot)
Using biomaRt database to map from Ensembl ID to uniprot ID
For genes without a successful map, we used the biomaRt database to map the genes to their corresponding uniprot IDs via an Ensembl-to-Uniprot mapping.
<- CPDB.gene2name$ensembl[is.na(CPDB.gene2name$entry)]
na.genes na.genes
[1] "ENSG00000285070" "ENSG00000283448" "ENSG00000203722" "ENSG00000177707"
[5] "ENSG00000164520" "ENSG00000159346" "ENSG00000155918" "ENSG00000143217"
[9] "ENSG00000131019" "ENSG00000131015" "ENSG00000130202" "ENSG00000123146"
[13] "ENSG00000111981" "ENSG00000110400" "ENSG00000101000"
# Load biomaRt database
<- useMart('ENSEMBL_MART_ENSEMBL')
mart <- useDataset('hsapiens_gene_ensembl', mart)
mart
# listAttributes(mart) %>% view()
<- getBM(
annotLookup mart = mart,
attributes = c(
'hgnc_symbol',
'ensembl_gene_id',
'uniprotswissprot'),
uniqueRows = TRUE) %>%
arrange(desc(uniprotswissprot)) %>%
distinct(ensembl_gene_id, .keep_all = T) %>%
filter(uniprotswissprot != "")
%>% dim() annotLookup
[1] 21801 3
%>% head() annotLookup
hgnc_symbol ensembl_gene_id uniprotswissprot
1 SYT15B ENSG00000288175 X6R8R1
2 SYT15B ENSG00000277758 X6R8R1
3 CIMIP3 ENSG00000287363 X6R8D5
4 PYDC5 ENSG00000289721 W6CW81
5 SPACA6 ENSG00000182310 W5XKT8
6 A3GALT2 ENSG00000184389 U3KPV4
<- CPDB.gene2name %>%
CPDB.gene2name left_join(annotLookup %>%
select(-hgnc_symbol), by = c("ensembl" = "ensembl_gene_id")) %>%
mutate(entry = ifelse(ensembl %in% na.genes, uniprotswissprot, entry),
agree = ifelse(entry == uniprotswissprot, T, F))
%>% dim() CPDB.gene2name
[1] 1355 9
%>%
CPDB.gene2name head() %>%
::kable(align = rep("l", nrow(.))) knitr
id_gene | ensembl | gene_name | hgnc_symbol | protein_name | protein_id | entry | uniprotswissprot | agree |
---|---|---|---|---|---|---|---|---|
472 | ENSG00000286070 | GGT1 | GGT1 | GGT1_HUMAN | 396 | P19440 | NA | NA |
989 | ENSG00000285456 | LTB4R | LTB4R | LT4R1_HUMAN | 782 | Q15722 | Q15722 | TRUE |
1240 | ENSG00000285203 | LTB4R2 | LTB4R2 | LT4R2_HUMAN | 984 | Q9NPC1 | Q9NPC1 | TRUE |
1071 | ENSG00000285070 | ADIPOR2 | ADIPOR2 | ADR2_HUMAN | 843 | Q86V24 | Q86V24 | TRUE |
500 | ENSG00000284816 | EPHA1 | EPHA1 | EPHA1_HUMAN | 419 | P21709 | P21709 | TRUE |
715 | ENSG00000284510 | KIR2DL3 | KIR2DL3 | KI2L3_HUMAN | 592 | P43628 | P43628 | TRUE |
rm(na.genes, mart, annotLookup)
Manual check for genes that do not match
%>%
CPDB.gene2name tabyl(agree) %>%
::kable() knitr
agree | n | percent | valid_percent |
---|---|---|---|
FALSE | 7 | 0.0051661 | 0.0052045 |
TRUE | 1338 | 0.9874539 | 0.9947955 |
NA | 10 | 0.0073801 | NA |
<- na.omit(CPDB.gene2name$ensembl[CPDB.gene2name$agree == F])
false.genes
%>%
CPDB.gene2name filter(ensembl %in% false.genes) %>%
::kable(align = rep("l", nrow(.))) knitr
id_gene | ensembl | gene_name | hgnc_symbol | protein_name | protein_id | entry | uniprotswissprot | agree |
---|---|---|---|---|---|---|---|---|
29 | ENSG00000197919 | IFNA1 | IFNA1 | L0N195_HUMAN | 182 | L0N195 | P01562 | FALSE |
1579 | ENSG00000179915 | NRXN1 | NRXN1 | NRX1B_HUMAN | 1275 | P58400 | Q9ULB1 | FALSE |
1742 | ENSG00000179915 | NRXN1 | NRXN1 | NRX1A_HUMAN | 1274 | Q9ULB1 | Q9ULB1 | TRUE |
1759 | ENSG00000171867 | PRNP | PRNP | APRIO_HUMAN | 1173 | F7VJQ1 | P04156 | FALSE |
185 | ENSG00000110680 | CALCA | CALCA | CALC_HUMAN | 171 | P01258 | P06881 | FALSE |
1580 | ENSG00000110076 | NRXN2 | NRXN2 | NRX2B_HUMAN | 1276 | P58401 | Q9P2S2 | FALSE |
37 | ENSG00000101307 | SIRPB1 | SIRPB1 | SIRB1_HUMAN | 45 | O00241 | Q5TFQ8 | FALSE |
1705 | ENSG00000021645 | NRXN3 | NRXN3 | NRX3B_HUMAN | 1277 | Q9HDB5 | Q9Y4C0 | FALSE |
# Manual check
# ENSG00000197919 use uniprotswissprot
# ENSG00000179915, ENSG00000171867, ENSG00000110680, ENSG00000110076, ENSG00000101307, ENSG00000021645 unchanged
<- CPDB.gene2name %>%
CPDB.gene2uniprot mutate(entry = ifelse(ensembl == "ENSG00000197919", uniprotswissprot, entry)) %>%
rename(uniprot_id = entry) %>%
select(-uniprotswissprot, - agree)
write_csv(CPDB.gene2uniprot, file = "results/gene_cellphonedb.csv")
%>% dim() CPDB.gene2uniprot
[1] 1355 7
Full CellPhoneDB gene mapping table
CPDB.gene2uniprot
rm(false.genes, CPDB.gene2name, CPDB.gene2uniprot)
3.2. Mapping CellChat genes to Uniprot IDs
<- CellChatDB.human$geneInfo %>%
CCDB.gene2uniprot clean_names()
<- CCDB.gene2uniprot$symbol[(is.na(CCDB.gene2uniprot$entry_id_uniprot))]
na.genes
na.genes
[1] "CCL3L1" "CCL3L3" "GPR1"
<- CCDB.gene2uniprot %>%
CCDB.gene2uniprot mutate(entry_id_uniprot = case_when(
%in% "CCL3L1" ~ "P16619",
symbol == "CCL3L3" ~ "P16619_L3", # CCL3L1 & CCL3L3 mapped to same gene; differentiate with "_L3" for CCL3L3!
symbol == "GPR1" ~ "P46091",
symbol .default = entry_id_uniprot
))
%>% dim() CCDB.gene2uniprot
[1] 26827 9
Full CellChat gene mapping table
CCDB.gene2uniprot
Click on the rows if you wish to extract full information on protein_name, keywords and location.