Load example data set and extract the genotypes.
library(vcfR)
data(vcfR_test)
head(vcfR_test)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.3"
## [1] "##fileDate=20090805"
## [1] "##source=myImputationProgramV3.1"
## [1] "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta"
## [1] "##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d [Truncated]"
## [1] "##phasing=partial"
## [1] "First 6 rows."
## [1]
## [1] "***** Fixed section *****"
## CHROM POS ID REF ALT QUAL FILTER
## [1,] "20" "14370" "rs6054257" "G" "A" "29" "PASS"
## [2,] "20" "17330" NA "T" "A" "3" "q10"
## [3,] "20" "1110696" "rs6040355" "A" "G,T" "67" "PASS"
## [4,] "20" "1230237" NA "T" NA "47" "PASS"
## [5,] "20" "1234567" "microsat1" "GTC" "G,GTCT" "50" "PASS"
## [1]
## [1] "***** Genotype section *****"
## FORMAT NA00001 NA00002 NA00003
## [1,] "GT:GQ:DP:HQ" "0|0:48:1:51,51" "1|0:48:8:51,51" "1/1:43:5:.,."
## [2,] "GT:GQ:DP:HQ" "0|0:49:3:58,50" "0|1:3:5:65,3" "0/0:41:3"
## [3,] "GT:GQ:DP:HQ" "1|2:21:6:23,27" "2|1:2:0:18,2" "2/2:35:4"
## [4,] "GT:GQ:DP:HQ" "0|0:54:7:56,60" "0|0:48:4:51,51" "0/0:61:2"
## [5,] "GT:GQ:DP" "0/1:35:4" "0/2:17:2" "1/1:40:3"
## [1]
## [1] "Unique GT formats:"
## [1] "GT:GQ:DP:HQ" "GT:GQ:DP"
## [1]
gt <- extract.gt(vcfR_test)
head(gt)
## NA00001 NA00002 NA00003
## rs6054257 "0|0" "1|0" "1/1"
## 20_17330 "0|0" "0|1" "0/0"
## rs6040355 "1|2" "2|1" "2/2"
## 20_1230237 "0|0" "0|0" "0/0"
## microsat1 "0/1" "0/2" "1/1"
Use is_het()
to identify heterozygous positions and
censor as NA.
is.na(vcfR_test@gt[,-1][is_het(gt)]) <- TRUE
vcfR_test
## ***** Object of Class vcfR *****
## 3 samples
## 1 CHROMs
## 5 variants
## Object size: 0 Mb
## 40 percent missing data
## ***** ***** *****
Extract the remaining genotypes and use unique()
to
query for the unique genotypes.
gt <- extract.gt(vcfR_test)
unique(as.vector(gt))
## [1] "0|0" NA "1/1" "0/0" "2/2"
We can now manually set each genotype to a haploid state. Note that your genotypes will be different!
gt[gt=="0|0"] <- 0
gt[gt=="0/0"] <- 0
gt[gt=="1/1"] <- 1
gt[gt=="2/2"] <- 2
head(gt)
## NA00001 NA00002 NA00003
## rs6054257 "0" NA "1"
## 20_17330 "0" NA "0"
## rs6040355 NA NA "2"
## 20_1230237 "0" "0" "0"
## microsat1 NA NA "1"
Now extract everything but the genotype from the vcfR object, paste our haploid genotypes to it and update the vcfR object.
gt2 <- extract.gt(vcfR_test, extract = FALSE)
head(gt2)
## NA00001 NA00002 NA00003
## rs6054257 "48:1:51,51" NA "43:5:.,."
## 20_17330 "49:3:58,50" NA "41:3"
## rs6040355 NA NA "35:4"
## 20_1230237 "54:7:56,60" "48:4:51,51" "61:2"
## microsat1 NA NA "40:3"
gt <- matrix( paste(gt, gt2, sep=":"), nrow=nrow(gt), dimnames=dimnames(gt) )
is.na(gt[gt == "NA:NA"]) <- TRUE
vcfR_test@gt[,-1] <- gt
head(vcfR_test)
## [1] "***** Object of class 'vcfR' *****"
## [1] "***** Meta section *****"
## [1] "##fileformat=VCFv4.3"
## [1] "##fileDate=20090805"
## [1] "##source=myImputationProgramV3.1"
## [1] "##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta"
## [1] "##contig=<ID=20,length=62435964,assembly=B36,md5=f126cdf8a6e0c7f379d [Truncated]"
## [1] "##phasing=partial"
## [1] "First 6 rows."
## [1]
## [1] "***** Fixed section *****"
## CHROM POS ID REF ALT QUAL FILTER
## [1,] "20" "14370" "rs6054257" "G" "A" "29" "PASS"
## [2,] "20" "17330" NA "T" "A" "3" "q10"
## [3,] "20" "1110696" "rs6040355" "A" "G,T" "67" "PASS"
## [4,] "20" "1230237" NA "T" NA "47" "PASS"
## [5,] "20" "1234567" "microsat1" "GTC" "G,GTCT" "50" "PASS"
## [1]
## [1] "***** Genotype section *****"
## FORMAT NA00001 NA00002 NA00003
## [1,] "GT:GQ:DP:HQ" "0:48:1:51,51" NA "1:43:5:.,."
## [2,] "GT:GQ:DP:HQ" "0:49:3:58,50" NA "0:41:3"
## [3,] "GT:GQ:DP:HQ" NA NA "2:35:4"
## [4,] "GT:GQ:DP:HQ" "0:54:7:56,60" "0:48:4:51,51" "0:61:2"
## [5,] "GT:GQ:DP" NA NA "1:40:3"
## [1]
## [1] "Unique GT formats:"
## [1] "GT:GQ:DP:HQ" "GT:GQ:DP"
## [1]
Copyright © 2017, 2018 Brian J. Knaus. All rights reserved.
USDA Agricultural Research Service, Horticultural Crops Research Lab.