Haploidizing diploid data

vcfR documentation

by
Brian J. Knaus and Niklaus J. Grünwald

Load example data set and extract the genotypes.

library(vcfR)
data(vcfR_test)
head(vcfR_test)

## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.3"
## [1] "##fileDate=20090805"
## [1] "##source=myImputationProgramV3.1"
## [1] "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta"
## [1] "##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d [Truncated]"
## [1] "##phasing=partial"
## [1] "First 6 rows."
## [1] 
## [1] "***** Fixed section *****"
##      CHROM POS       ID          REF   ALT      QUAL FILTER
## [1,] "20"  "14370"   "rs6054257" "G"   "A"      "29" "PASS"
## [2,] "20"  "17330"   NA          "T"   "A"      "3"  "q10" 
## [3,] "20"  "1110696" "rs6040355" "A"   "G,T"    "67" "PASS"
## [4,] "20"  "1230237" NA          "T"   NA       "47" "PASS"
## [5,] "20"  "1234567" "microsat1" "GTC" "G,GTCT" "50" "PASS"
## [1] 
## [1] "***** Genotype section *****"
##      FORMAT        NA00001          NA00002          NA00003       
## [1,] "GT:GQ:DP:HQ" "0|0:48:1:51,51" "1|0:48:8:51,51" "1/1:43:5:.,."
## [2,] "GT:GQ:DP:HQ" "0|0:49:3:58,50" "0|1:3:5:65,3"   "0/0:41:3"    
## [3,] "GT:GQ:DP:HQ" "1|2:21:6:23,27" "2|1:2:0:18,2"   "2/2:35:4"    
## [4,] "GT:GQ:DP:HQ" "0|0:54:7:56,60" "0|0:48:4:51,51" "0/0:61:2"    
## [5,] "GT:GQ:DP"    "0/1:35:4"       "0/2:17:2"       "1/1:40:3"    
## [1] 
## [1] "Unique GT formats:"
## [1] "GT:GQ:DP:HQ" "GT:GQ:DP"   
## [1]

gt <- extract.gt(vcfR_test)
head(gt)

##            NA00001 NA00002 NA00003
## rs6054257  "0|0"   "1|0"   "1/1"  
## 20_17330   "0|0"   "0|1"   "0/0"  
## rs6040355  "1|2"   "2|1"   "2/2"  
## 20_1230237 "0|0"   "0|0"   "0/0"  
## microsat1  "0/1"   "0/2"   "1/1"

Use is_het() to identify heterozygous positions and censor as NA.

is.na(vcfR_test@gt[,-1][is_het(gt)]) <- TRUE
vcfR_test

## ***** Object of Class vcfR *****
## 3 samples
## 1 CHROMs
## 5 variants
## Object size: 0 Mb
## 40 percent missing data
## *****        *****         *****

Extract the remaining genotypes and use unique() to query for the unique genotypes.

gt <- extract.gt(vcfR_test)
unique(as.vector(gt))

## [1] "0|0" NA    "1/1" "0/0" "2/2"

We can now manually set each genotype to a haploid state. Note that your genotypes will be different!

gt[gt=="0|0"] <- 0
gt[gt=="0/0"] <- 0
gt[gt=="1/1"] <- 1
gt[gt=="2/2"] <- 2
head(gt)

##            NA00001 NA00002 NA00003
## rs6054257  "0"     NA      "1"    
## 20_17330   "0"     NA      "0"    
## rs6040355  NA      NA      "2"    
## 20_1230237 "0"     "0"     "0"    
## microsat1  NA      NA      "1"

Now extract everything but the genotype from the vcfR object, paste our haploid genotypes to it and update the vcfR object.

gt2 <- extract.gt(vcfR_test, extract = FALSE)
head(gt2)

##            NA00001      NA00002      NA00003   
## rs6054257  "48:1:51,51" NA           "43:5:.,."
## 20_17330   "49:3:58,50" NA           "41:3"    
## rs6040355  NA           NA           "35:4"    
## 20_1230237 "54:7:56,60" "48:4:51,51" "61:2"    
## microsat1  NA           NA           "40:3"

gt <- matrix( paste(gt, gt2, sep=":"), nrow=nrow(gt), dimnames=dimnames(gt) )
is.na(gt[gt == "NA:NA"]) <- TRUE
vcfR_test@gt[,-1] <- gt
head(vcfR_test)

## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.3"
## [1] "##fileDate=20090805"
## [1] "##source=myImputationProgramV3.1"
## [1] "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta"
## [1] "##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d [Truncated]"
## [1] "##phasing=partial"
## [1] "First 6 rows."
## [1] 
## [1] "***** Fixed section *****"
##      CHROM POS       ID          REF   ALT      QUAL FILTER
## [1,] "20"  "14370"   "rs6054257" "G"   "A"      "29" "PASS"
## [2,] "20"  "17330"   NA          "T"   "A"      "3"  "q10" 
## [3,] "20"  "1110696" "rs6040355" "A"   "G,T"    "67" "PASS"
## [4,] "20"  "1230237" NA          "T"   NA       "47" "PASS"
## [5,] "20"  "1234567" "microsat1" "GTC" "G,GTCT" "50" "PASS"
## [1] 
## [1] "***** Genotype section *****"
##      FORMAT        NA00001        NA00002        NA00003     
## [1,] "GT:GQ:DP:HQ" "0:48:1:51,51" NA             "1:43:5:.,."
## [2,] "GT:GQ:DP:HQ" "0:49:3:58,50" NA             "0:41:3"    
## [3,] "GT:GQ:DP:HQ" NA             NA             "2:35:4"    
## [4,] "GT:GQ:DP:HQ" "0:54:7:56,60" "0:48:4:51,51" "0:61:2"    
## [5,] "GT:GQ:DP"    NA             NA             "1:40:3"    
## [1] 
## [1] "Unique GT formats:"
## [1] "GT:GQ:DP:HQ" "GT:GQ:DP"   
## [1]

USDA Agricultural Research Service, Horticultural Crops Research Lab.

vcfR documentation

Haploidizing diploid data

Brian J. Knaus

February 7, 2017