pacman::p_load(tidyverse, readxl, writexl, janitor, CellChat, ggvenn, biomaRt,
conflicted, tidytext, DT, kableExtra)
setwd(this.path::here())
rm(list = ls())
conflict_prefer("select", "dplyr")
conflict_prefer("filter", "dplyr")
conflict_prefer("setdiff", "base")
# removes NA from all kable
options(knitr.kable.NA = "") Finding Consensus with CellPhoneDBv5
This document is for outlining the mapping process of interactions generated from cell-cell communication (CCC) analysis - using both CellPhoneDBv5 and CellChatv2.
CellChat is used as the reference CCC tool, and we want to find out which interactions in CellChat are supported by CellPhoneDB. To ensure high confidence and unbiased mapping, we convert all gene symbols to their respective Uniprot IDs in both CCC tools, and do the same for the subunits of complexes.
1. Setup
1.1. R setup
1.2. Folder setup
sapply( c("data", "results"), dir.create, recursive = TRUE )1.3. Download CellPhoneDB data files
These files were downloaded on 11th July 2025.
basepath <- "https://raw.github.com/ventolab/cellphonedb-data/refs/heads/master"
# Main database containing the interaction
download.file( file.path(basepath, "cellphonedb.zip"), "data/cellphonedb.zip" )
unzip("data/cellphonedb.zip", exdir = "data", junkpaths = TRUE)
# Additional files for mapping
download.file( file.path(basepath, "data/complex_input.csv"), "data/complex_input.csv" )
download.file( file.path(basepath, "data/protein_input.csv"), "data/protein_input.csv" )
download.file( file.path(basepath, "data/sources/uniprot_synonyms.tsv"), "data/uniprot_synonyms.tsv" )2. Complex Mapping
2.1. Mapping CellPhoneDB complexes to Uniprot IDs
CPDB.complex2protein <- read_csv("data/complex_input.csv") %>%
select(complex_name, starts_with("uniprot"))
example <- c("12oxoLeukotrieneB4_byPTGR1", "TREM2_receptor", "RAreceptor_RARA_RXRA",
"Pregnenolone_byCYP11A1", "5HTR3_complex")CPDB.complex2protein %>%
filter(complex_name %in% example) %>%
mutate(complex_name = factor(complex_name, levels = example)) %>%
arrange(complex_name) %>%
knitr::kable()| complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 |
|---|---|---|---|---|---|
| 12oxoLeukotrieneB4_byPTGR1 | Q14914 | ||||
| TREM2_receptor | Q9NZC2 | O43914 | |||
| RAreceptor_RARA_RXRA | P10276 | P19793 | P29373 | ||
| Pregnenolone_byCYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | |
| 5HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 |
CPDB.complex2protein %>%
mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
tabyl(nproteins_per_complex) %>%
knitr::kable(align = rep("l", nrow(.)))| nproteins_per_complex | n | percent |
|---|---|---|
| 1 | 61 | 0.1685083 |
| 2 | 261 | 0.7209945 |
| 3 | 36 | 0.0994475 |
| 4 | 3 | 0.0082873 |
| 5 | 1 | 0.0027624 |
There are 362 complexes and each complex can include up to 5 proteins (but typically 2).
2.2. Add a unique key for matching
To create the complex key, we concatenated the subunits of heteromeric complexes into a vector, removed NAs within, and sorted the subunits before collapsing them into a single character value. We found that sorting eases the mapping process computationally between CellPhoneDB and CellChat (Before sort: < 250 mapped; After sort: > 290 mapped).
CPDB.complex2protein <- CPDB.complex2protein %>%
rowwise() %>%
mutate(complex_key = paste(
sort(na.omit(c(uniprot_1, uniprot_2, uniprot_3, uniprot_4, uniprot_5))),
collapse = "_")) %>%
ungroup()CPDB.complex2protein %>%
filter(complex_name %in% example) %>%
mutate(complex_name = factor(complex_name, levels = example)) %>%
arrange(complex_name) %>%
knitr::kable()| complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key |
|---|---|---|---|---|---|---|
| 12oxoLeukotrieneB4_byPTGR1 | Q14914 | Q14914 | ||||
| TREM2_receptor | Q9NZC2 | O43914 | O43914_Q9NZC2 | |||
| RAreceptor_RARA_RXRA | P10276 | P19793 | P29373 | P10276_P19793_P29373 | ||
| Pregnenolone_byCYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | P05108_P10109_P22570_Q6P4F2 | |
| 5HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 | A5X5Y0_O95264_P46098_Q70Z44_Q8WXA8 |
2.3. Mapping CellChatDB complexes to Uniprot IDs
CellChat version 2.2.0 stores the database within R under CellChatDB.human.
pacman::p_version("CellChat")[1] '2.2.0'
CCDB.complex2gene <- CellChatDB.human$complex %>%
mutate(complex_name = rownames(.)) %>%
rename_with( ~ gsub("subunit", "symbol", .)) %>%
relocate(complex_name) %>%
remove_rownames()
example <- c("12oxoLTB4-PTGR1", "TREM2_TYROBP", "RARA_RXRA_CRABP2",
"Pregnenolone-CYP11A1", "HTR3_complex")
CCDB.complex2gene %>% dim()[1] 338 6
CCDB.complex2gene %>%
filter(complex_name %in% example) %>%
knitr::kable()| complex_name | symbol_1 | symbol_2 | symbol_3 | symbol_4 | symbol_5 |
|---|---|---|---|---|---|
| 12oxoLTB4-PTGR1 | PTGR1 | ||||
| TREM2_TYROBP | TREM2 | TYROBP | |||
| RARA_RXRA_CRABP2 | RARA | RXRA | CRABP2 | ||
| Pregnenolone-CYP11A1 | CYP11A1 | FDX2 | FDX1 | FDXR | |
| HTR3_complex | HTR3A | HTR3B | HTR3C | HTR3D | HTR3E |
We need to map the gene symbols to uniprot IDs to make this database compatible with CellPhoneDB.
CCDB.protein2gene <- CellChatDB.human$geneInfo %>%
select(EntryID.uniprot, Symbol)
CCDB.protein2gene %>% dim()[1] 26827 2
CCDB.protein2gene %>% head()# A tibble: 6 × 2
EntryID.uniprot Symbol
<chr> <chr>
1 P04217 A1BG
2 Q9NQ94 A1CF
3 P01023 A2M
4 A8K2U0 A2ML1
5 U3KPV4 A3GALT2
6 Q9NPC4 A4GALT
## Map gene symbol to uniprot
CCDB.complex2protein <- CCDB.complex2gene %>%
pivot_longer(cols = !complex_name) %>%
left_join(CCDB.protein2gene, by = c("value" = "Symbol")) %>%
select(complex_name, name, EntryID.uniprot) %>%
pivot_wider(id_cols = complex_name, names_from = "name", values_from = "EntryID.uniprot") %>%
rename_with( ~ gsub("symbol", "uniprot", .))
CCDB.complex2protein %>%
filter(complex_name %in% example) %>%
knitr::kable()| complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 |
|---|---|---|---|---|---|
| 12oxoLTB4-PTGR1 | Q14914 | ||||
| TREM2_TYROBP | Q9NZC2 | O43914 | |||
| RARA_RXRA_CRABP2 | P10276 | P19793 | P29373 | ||
| Pregnenolone-CYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | |
| HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 |
Again, we can count the number of proteins per complex:
CCDB.complex2protein %>%
mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
tabyl(nproteins_per_complex) %>%
knitr::kable(align = rep("l", nrow(.)))| nproteins_per_complex | n | percent |
|---|---|---|
| 1 | 50 | 0.1479290 |
| 2 | 250 | 0.7396450 |
| 3 | 33 | 0.0976331 |
| 4 | 4 | 0.0118343 |
| 5 | 1 | 0.0029586 |
Now, let’s add a key to help with the comparison, using the same logic from CellPhoneDB.
CCDB.complex2protein <- CCDB.complex2protein %>%
rowwise() %>%
mutate(complex_key = paste(
sort(na.omit(c(uniprot_1, uniprot_2, uniprot_3, uniprot_4, uniprot_5))),
collapse = "_")) %>%
ungroup()CCDB.complex2protein %>%
filter(complex_name %in% example) %>%
knitr::kable()| complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key |
|---|---|---|---|---|---|---|
| 12oxoLTB4-PTGR1 | Q14914 | Q14914 | ||||
| TREM2_TYROBP | Q9NZC2 | O43914 | O43914_Q9NZC2 | |||
| RARA_RXRA_CRABP2 | P10276 | P19793 | P29373 | P10276_P19793_P29373 | ||
| Pregnenolone-CYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | P05108_P10109_P22570_Q6P4F2 | |
| HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 | A5X5Y0_O95264_P46098_Q70Z44_Q8WXA8 |
2.4. Duplicate keys
Before we merge, let us pull out the duplicate keys within each database for manual curation.
CPDB.complex2protein.uniq <- CPDB.complex2protein %>%
add_count(complex_key) %>%
filter(n == 1) %>%
select(-n)
CPDB.complex2protein.dup <- CPDB.complex2protein %>%
add_count(complex_key) %>%
filter(n > 1) %>%
select(-n) %>%
arrange(complex_key)Duplicate complex keys in CellPhoneDB
CPDB.complex2protein.dup %>%
mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
arrange(nproteins_per_complex) %>%
select(-nproteins_per_complex) %>%
knitr::kable()| complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key |
|---|---|---|---|---|---|---|
| SerotoninDopamin_byDDC | P20711 | P20711 | ||||
| b-PEA_byDDC | P20711 | P20711 | ||||
| Desmosterol_byDHCR7 | Q9UBM7 | Q9UBM7 | ||||
| Cholesterol_byDHCR7 | Q9UBM7 | Q9UBM7 | ||||
| IL28_receptor | Q8IU57 | Q08334 | Q08334_Q8IU57 | |||
| Type_III_IFNR | Q08334 | Q8IU57 | Q08334_Q8IU57 | |||
| 5SHpETE_byALOX5 | P09917 | Q16873 | P20292 | P09917_P20292_Q16873 | ||
| LeukotrieneC4_byLTC4S | P20292 | P09917 | Q16873 | P09917_P20292_Q16873 | ||
| LipoxinA4_byALOX5 | P09917 | P20292 | Q16873 | P09917_P20292_Q16873 |
CCDB.complex2protein.uniq <- CCDB.complex2protein %>%
add_count(complex_key) %>%
filter(n == 1) %>%
select(-n)
CCDB.complex2protein.dup <- CCDB.complex2protein %>%
add_count(complex_key) %>%
filter(n > 1) %>%
select(-n) %>%
arrange(complex_key)Duplicate complex keys in CellChat
CCDB.complex2protein.dup %>%
mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
arrange(nproteins_per_complex) %>%
select(-nproteins_per_complex) %>%
knitr::kable()| complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key |
|---|---|---|---|---|---|---|
| Cholesterol-DHCR7 | Q9UBM7 | Q9UBM7 | ||||
| Desmosterol-DHCR7 | Q9UBM7 | Q9UBM7 | ||||
| CALCRL_RAMP1 | P30988 | O60894 | O60894_P30988 | |||
| CALCR_RAMP1 | P30988 | O60894 | O60894_P30988 | |||
| CALCRL_RAMP2 | P30988 | O60895 | O60895_P30988 | |||
| CALCR_RAMP2 | P30988 | O60895 | O60895_P30988 | |||
| CALCRL_RAMP3 | P30988 | O60896 | O60896_P30988 | |||
| CALCR_RAMP3 | P30988 | O60896 | O60896_P30988 | |||
| ITGA1_ITGB1 | P56199 | P05556 | P05556_P56199 | |||
| ITGB1_ITGA1 | P05556 | P56199 | P05556_P56199 | |||
| KLRD1_KLRC1 | Q13241 | P26715 | P26715_Q13241 | |||
| CD94:NKG2A | Q13241 | P26715 | P26715_Q13241 | |||
| KLRD1_KLRC2 | Q13241 | P26717 | P26717_Q13241 | |||
| CD94:NKG2C | Q13241 | P26717 | P26717_Q13241 | |||
| KLRK1_HCST | P26718 | Q9UBK5 | P26718_Q9UBK5 | |||
| NKG2D_HCST | P26718 | Q9UBK5 | P26718_Q9UBK5 | |||
| LTC4-LTC4S | P20292 | P09917 | Q16873 | P09917_P20292_Q16873 | ||
| LXA4-ALOX5 | P09917 | P20292 | Q16873 | P09917_P20292_Q16873 |
Let’s merge the unique records
combined_unique <- full_join(CCDB.complex2protein.uniq, CPDB.complex2protein.uniq,
by = "complex_key",
suffix = c(".CC", ".CP")) %>%
relocate("complex_key")combined_unique
Save the output as excel file for manual mapping
openxlsx::write.xlsx(
list( partial = combined_unique,
CCDB_dup = CCDB.complex2protein.dup,
CPDB_dup = CPDB.complex2protein.dup),
file = "matched_partial_tmp.xlsx")After manually mapping complexes to both databases and assigning them to a unique complex key, the mapped file contains six sheets listed as such:
excel_sheets("matched_partial.xlsx")[1] "partial" "CCDB_dup" "CPDB_dup" "partial match"
[5] "mapped" "clean"
How duplicate keys were handled
CellPhoneDB
CPDB.dup <- read_excel("matched_partial.xlsx", sheet = "CPDB_dup")
CPDB.dup %>%
mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
arrange(nproteins_per_complex) %>%
select(-nproteins_per_complex) %>%
knitr::kable()| complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key | proposed | match found |
|---|---|---|---|---|---|---|---|---|
| b-PEA_byDDC | P20711 | P20711 | no | |||||
| SerotoninDopamin_byDDC | P20711 | P20711 | yes | |||||
| Desmosterol_byDHCR7 | Q9UBM7 | Q9UBM7 | yes | |||||
| Cholesterol_byDHCR7 | Q9UBM7 | Q9UBM7 | yes | |||||
| IL28_receptor | Q8IU57 | Q08334 | Q08334_Q8IU57 | yes | ||||
| Type_III_IFNR | Q08334 | Q8IU57 | Q08334_Q8IU57 | Q8IU57_Q08334 | yes | |||
| 5SHpETE_byALOX5 | P09917 | Q16873 | P20292 | P09917_P20292_Q16873 | P09917_Q16873_P20292 | no | ||
| LeukotrieneC4_byLTC4S | P20292 | P09917 | Q16873 | P09917_P20292_Q16873 | P20292_P09917_Q16873 | yes | ||
| LipoxinA4_byALOX5 | P09917 | P20292 | Q16873 | P09917_P20292_Q16873 | P09917_P20292_Q16873 | yes |
CellChat
CCDB.dup <- read_excel("matched_partial.xlsx", sheet = "CCDB_dup")
CCDB.dup %>%
mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
arrange(nproteins_per_complex) %>%
select(-nproteins_per_complex) %>%
knitr::kable()| complex_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | complex_key | proposed | match found |
|---|---|---|---|---|---|---|---|---|
| Cholesterol-DHCR7 | Q9UBM7 | Q9UBM7 | yes | |||||
| Desmosterol-DHCR7 | Q9UBM7 | Q9UBM7 | yes | |||||
| KLRD1_KLRC1 | Q13241 | P26715 | P26715_Q13241 | Unused in CellChat | ||||
| KLRD1_KLRC2 | Q13241 | P26717 | P26717_Q13241 | Unused in CellChat | ||||
| KLRK1_HCST | P26718 | Q9UBK5 | P26718_Q9UBK5 | Unused in CellChat | ||||
| CALCRL_RAMP1 | P30988 | O60894 | O60894_P30988 | O60894_Q16602 | yes | |||
| CALCR_RAMP1 | P30988 | O60894 | O60894_P30988 | yes | ||||
| CALCRL_RAMP2 | P30988 | O60895 | O60895_P30988 | O60895_Q16602 | yes | |||
| CALCR_RAMP2 | P30988 | O60895 | O60895_P30988 | yes | ||||
| CALCRL_RAMP3 | P30988 | O60896 | O60896_P30988 | O60896_Q16602 | yes | |||
| CALCR_RAMP3 | P30988 | O60896 | O60896_P30988 | yes | ||||
| ITGA1_ITGB1 | P56199 | P05556 | P05556_P56199 | yes | ||||
| ITGB1_ITGA1 | P05556 | P56199 | P05556_P56199 | yes | ||||
| CD94:NKG2A | Q13241 | P26715 | P26715_Q13241 | yes | ||||
| CD94:NKG2C | Q13241 | P26717 | P26717_Q13241 | yes | ||||
| NKG2D_HCST | P26718 | Q9UBK5 | P26718_Q9UBK5 | yes | ||||
| LTC4-LTC4S | P20292 | P09917 | Q16873 | P09917_P20292_Q16873 | P20292_P09917_Q16873 | yes | ||
| LXA4-ALOX5 | P09917 | P20292 | Q16873 | P09917_P20292_Q16873 | P09917_P20292_Q16873 | yes |
We include a proposed complex key to differentiate complexes composed of the same subunits based on:
(1) functional context the complex operates (e.g. same complex producing different non-protein compounds)
(2) false annotation of uniprot ID to its corresponding gene (e.g. P30988 (CALCR) replaced to Q16602 (CALCRL) for CALCRL-containing complexes).
Load the complete curated file
complex <- read_excel("matched_partial.xlsx", sheet = "clean")
complex %>% dim()[1] 394 10
Mapped complexes file with the above example complexes for illustration
Each complex in CellChat and CellPhoneDB is now assigned a unique complex key
complex %>%
filter(CellChat_name %in% example) %>%
mutate(CellChat_name = factor(CellChat_name, levels = example)) %>%
arrange(CellChat_name) %>%
knitr::kable()| complex_key | CellChat_name | CellPhoneDB_name | uniprot_1 | uniprot_2 | uniprot_3 | uniprot_4 | uniprot_5 | match | Note |
|---|---|---|---|---|---|---|---|---|---|
| Q14914 | 12oxoLTB4-PTGR1 | 12oxoLeukotrieneB4_byPTGR1 | Q14914 | exact | |||||
| O43914_Q9NZC2 | TREM2_TYROBP | TREM2_receptor | Q9NZC2 | O43914 | exact | ||||
| P10276_P19793_P29373 | RARA_RXRA_CRABP2 | RAreceptor_RARA_RXRA | P10276 | P19793 | P29373 | exact | |||
| P05108_P10109_P22570_Q6P4F2 | Pregnenolone-CYP11A1 | Pregnenolone_byCYP11A1 | P05108 | Q6P4F2 | P10109 | P22570 | exact | ||
| A5X5Y0_O95264_P46098_Q70Z44_Q8WXA8 | HTR3_complex | 5HTR3_complex | P46098 | O95264 | Q8WXA8 | Q70Z44 | A5X5Y0 | exact |
Breakdown of complexes in the final list
complex %>%
tabyl(match) %>%
arrange(desc(n)) match n percent
exact 296 0.75126904
Found only in CellPhoneDB 55 0.13959391
Found only in CellChat 31 0.07868020
manual 8 0.02030457
partial match 4 0.01015228
Following manual curation of the complexes and mapping based on uniprot IDs, the break down of the matching process is as shown above. We display the manually-mapped complexes, and partial match complexes which we define as sharing at least two subunits.
complex %>%
filter(match %in% c("manual", "partial match")) %>%
mutate(nproteins_per_complex = rowSums(!is.na(across(starts_with("uniprot"))))) %>%
arrange(nproteins_per_complex) %>%
select(-nproteins_per_complex)Click on the rows to view information in the Note column.
Full complex mapping table
complex3. Gene Mapping
We map genes in both databases to their respective uniprot IDs for consistency.
3.1. Mapping CellPhoneDB genes to Uniprot IDs
Preprocessing data
rm(list = ls())
options(knitr.kable.NA = NA) # allow NA in kable from this section
CPDB.name2uniprot <- read_delim("data/uniprot_synonyms.tsv") %>%
clean_names() %>%
select(entry_name, entry)
CPDB.name2uniprot %>% dim()[1] 207304 2
CPDB.name2uniprot %>%
head()# A tibble: 6 × 2
entry_name entry
<chr> <chr>
1 A0A024QZ08_HUMAN A0A024QZ08
2 A0A024QZ26_HUMAN A0A024QZ26
3 A0A024QZ86_HUMAN A0A024QZ86
4 A0A024QZA8_HUMAN A0A024QZA8
5 A0A024QZB8_HUMAN A0A024QZB8
6 A0A024QZQ1_HUMAN A0A024QZQ1
CPDB.gene2name <- read_csv("data/gene_table.csv") %>%
arrange(desc(ensembl)) %>%
distinct(protein_id, .keep_all = T)
CPDB.gene2name %>% dim()[1] 1355 6
CPDB.gene2name %>%
head() %>%
knitr::kable(align = rep("l", nrow(.)))| id_gene | ensembl | gene_name | hgnc_symbol | protein_name | protein_id |
|---|---|---|---|---|---|
| 472 | ENSG00000286070 | GGT1 | GGT1 | GGT1_HUMAN | 396 |
| 989 | ENSG00000285456 | LTB4R | LTB4R | LT4R1_HUMAN | 782 |
| 1240 | ENSG00000285203 | LTB4R2 | LTB4R2 | LT4R2_HUMAN | 984 |
| 1071 | ENSG00000285070 | ADIPOR2 | ADIPOR2 | ADR2_HUMAN | 843 |
| 500 | ENSG00000284816 | EPHA1 | EPHA1 | EPHA1_HUMAN | 419 |
| 715 | ENSG00000284510 | KIR2DL3 | KIR2DL3 | KI2L3_HUMAN | 592 |
We want to map protein_name in CPDB.gene2name to entry (uniprot ID) using CPDB.name2uniprot.
Addressing invalid data points (i.e. non-uniprot protein name, missing entries)
Some proteins did not have a valid uniprot name (no “_HUMAN” in the protein name). Therefore, we manually added the uniprot name with reference from https://www.uniprot.org/.
CPDB.gene2name$protein_name[!grepl("HUMAN$", CPDB.gene2name$protein_name)][1] "CCL3L1" "HLAC" "HLAA" "HLAB" "IFNA1"
# fill in missing entry names
CPDB.gene2name <- CPDB.gene2name %>%
mutate(protein_name = case_when(
protein_name %in% c("HLAA", "HLAB", "HLAC") ~ paste0(protein_name, "_HUMAN"),
protein_name == "CCL3L1" ~ "CL3L1_HUMAN",
protein_name == "IFNA1" ~ "L0N195_HUMAN",
.default = protein_name
)) %>%
# mapping from uniprot protein name to the corresponding uniprot IDs
left_join(CPDB.name2uniprot %>%
select(entry, entry_name), by = c("protein_name" = "entry_name"))
CPDB.gene2name %>% dim()[1] 1355 7
CPDB.gene2name %>%
head() %>%
knitr::kable(align = rep("l", nrow(.)))| id_gene | ensembl | gene_name | hgnc_symbol | protein_name | protein_id | entry |
|---|---|---|---|---|---|---|
| 472 | ENSG00000286070 | GGT1 | GGT1 | GGT1_HUMAN | 396 | P19440 |
| 989 | ENSG00000285456 | LTB4R | LTB4R | LT4R1_HUMAN | 782 | Q15722 |
| 1240 | ENSG00000285203 | LTB4R2 | LTB4R2 | LT4R2_HUMAN | 984 | Q9NPC1 |
| 1071 | ENSG00000285070 | ADIPOR2 | ADIPOR2 | ADR2_HUMAN | 843 | NA |
| 500 | ENSG00000284816 | EPHA1 | EPHA1 | EPHA1_HUMAN | 419 | P21709 |
| 715 | ENSG00000284510 | KIR2DL3 | KIR2DL3 | KI2L3_HUMAN | 592 | P43628 |
rm(CPDB.name2uniprot)Using biomaRt database to map from Ensembl ID to uniprot ID
For genes without a successful map, we used the biomaRt database to map the genes to their corresponding uniprot IDs via an Ensembl-to-Uniprot mapping.
na.genes <- CPDB.gene2name$ensembl[is.na(CPDB.gene2name$entry)]
na.genes [1] "ENSG00000285070" "ENSG00000283448" "ENSG00000203722" "ENSG00000177707"
[5] "ENSG00000164520" "ENSG00000159346" "ENSG00000155918" "ENSG00000143217"
[9] "ENSG00000131019" "ENSG00000131015" "ENSG00000130202" "ENSG00000123146"
[13] "ENSG00000111981" "ENSG00000110400" "ENSG00000101000"
# Load biomaRt database
mart <- useMart('ENSEMBL_MART_ENSEMBL')
mart <- useDataset('hsapiens_gene_ensembl', mart)
# listAttributes(mart) %>% view()
annotLookup <- getBM(
mart = mart,
attributes = c(
'hgnc_symbol',
'ensembl_gene_id',
'uniprotswissprot'),
uniqueRows = TRUE) %>%
arrange(desc(uniprotswissprot)) %>%
distinct(ensembl_gene_id, .keep_all = T) %>%
filter(uniprotswissprot != "")
annotLookup %>% dim()[1] 21801 3
annotLookup %>% head() hgnc_symbol ensembl_gene_id uniprotswissprot
1 SYT15B ENSG00000288175 X6R8R1
2 SYT15B ENSG00000277758 X6R8R1
3 CIMIP3 ENSG00000287363 X6R8D5
4 PYDC5 ENSG00000289721 W6CW81
5 SPACA6 ENSG00000182310 W5XKT8
6 A3GALT2 ENSG00000184389 U3KPV4
CPDB.gene2name <- CPDB.gene2name %>%
left_join(annotLookup %>%
select(-hgnc_symbol), by = c("ensembl" = "ensembl_gene_id")) %>%
mutate(entry = ifelse(ensembl %in% na.genes, uniprotswissprot, entry),
agree = ifelse(entry == uniprotswissprot, T, F))
CPDB.gene2name %>% dim()[1] 1355 9
CPDB.gene2name %>%
head() %>%
knitr::kable(align = rep("l", nrow(.)))| id_gene | ensembl | gene_name | hgnc_symbol | protein_name | protein_id | entry | uniprotswissprot | agree |
|---|---|---|---|---|---|---|---|---|
| 472 | ENSG00000286070 | GGT1 | GGT1 | GGT1_HUMAN | 396 | P19440 | NA | NA |
| 989 | ENSG00000285456 | LTB4R | LTB4R | LT4R1_HUMAN | 782 | Q15722 | Q15722 | TRUE |
| 1240 | ENSG00000285203 | LTB4R2 | LTB4R2 | LT4R2_HUMAN | 984 | Q9NPC1 | Q9NPC1 | TRUE |
| 1071 | ENSG00000285070 | ADIPOR2 | ADIPOR2 | ADR2_HUMAN | 843 | Q86V24 | Q86V24 | TRUE |
| 500 | ENSG00000284816 | EPHA1 | EPHA1 | EPHA1_HUMAN | 419 | P21709 | P21709 | TRUE |
| 715 | ENSG00000284510 | KIR2DL3 | KIR2DL3 | KI2L3_HUMAN | 592 | P43628 | P43628 | TRUE |
rm(na.genes, mart, annotLookup)Manual check for genes that do not match
CPDB.gene2name %>%
tabyl(agree) %>%
knitr::kable()| agree | n | percent | valid_percent |
|---|---|---|---|
| FALSE | 7 | 0.0051661 | 0.0052045 |
| TRUE | 1338 | 0.9874539 | 0.9947955 |
| NA | 10 | 0.0073801 | NA |
false.genes <- na.omit(CPDB.gene2name$ensembl[CPDB.gene2name$agree == F])
CPDB.gene2name %>%
filter(ensembl %in% false.genes) %>%
knitr::kable(align = rep("l", nrow(.)))| id_gene | ensembl | gene_name | hgnc_symbol | protein_name | protein_id | entry | uniprotswissprot | agree |
|---|---|---|---|---|---|---|---|---|
| 29 | ENSG00000197919 | IFNA1 | IFNA1 | L0N195_HUMAN | 182 | L0N195 | P01562 | FALSE |
| 1579 | ENSG00000179915 | NRXN1 | NRXN1 | NRX1B_HUMAN | 1275 | P58400 | Q9ULB1 | FALSE |
| 1742 | ENSG00000179915 | NRXN1 | NRXN1 | NRX1A_HUMAN | 1274 | Q9ULB1 | Q9ULB1 | TRUE |
| 1759 | ENSG00000171867 | PRNP | PRNP | APRIO_HUMAN | 1173 | F7VJQ1 | P04156 | FALSE |
| 185 | ENSG00000110680 | CALCA | CALCA | CALC_HUMAN | 171 | P01258 | P06881 | FALSE |
| 1580 | ENSG00000110076 | NRXN2 | NRXN2 | NRX2B_HUMAN | 1276 | P58401 | Q9P2S2 | FALSE |
| 37 | ENSG00000101307 | SIRPB1 | SIRPB1 | SIRB1_HUMAN | 45 | O00241 | Q5TFQ8 | FALSE |
| 1705 | ENSG00000021645 | NRXN3 | NRXN3 | NRX3B_HUMAN | 1277 | Q9HDB5 | Q9Y4C0 | FALSE |
# Manual check
# ENSG00000197919 use uniprotswissprot
# ENSG00000179915, ENSG00000171867, ENSG00000110680, ENSG00000110076, ENSG00000101307, ENSG00000021645 unchanged
CPDB.gene2uniprot <- CPDB.gene2name %>%
mutate(entry = ifelse(ensembl == "ENSG00000197919", uniprotswissprot, entry)) %>%
rename(uniprot_id = entry) %>%
select(-uniprotswissprot, - agree)
write_csv(CPDB.gene2uniprot, file = "results/gene_cellphonedb.csv")
CPDB.gene2uniprot %>% dim()[1] 1355 7
Full CellPhoneDB gene mapping table
CPDB.gene2uniprotrm(false.genes, CPDB.gene2name, CPDB.gene2uniprot)3.2. Mapping CellChat genes to Uniprot IDs
CCDB.gene2uniprot <- CellChatDB.human$geneInfo %>%
clean_names()
na.genes <- CCDB.gene2uniprot$symbol[(is.na(CCDB.gene2uniprot$entry_id_uniprot))]
na.genes[1] "CCL3L1" "CCL3L3" "GPR1"
CCDB.gene2uniprot <- CCDB.gene2uniprot %>%
mutate(entry_id_uniprot = case_when(
symbol %in% "CCL3L1" ~ "P16619",
symbol == "CCL3L3" ~ "P16619_L3", # CCL3L1 & CCL3L3 mapped to same gene; differentiate with "_L3" for CCL3L3!
symbol == "GPR1" ~ "P46091",
.default = entry_id_uniprot
))
CCDB.gene2uniprot %>% dim()[1] 26827 9
Full CellChat gene mapping table
CCDB.gene2uniprotClick on the rows if you wish to extract full information on protein_name, keywords and location.