#include #include #include #include #include #include #include #include #include #include #include #include #include "Array" #include "Core" #include "Dense" using namespace Eigen; using namespace std; /* Contains the source code for haploPS alpha version. //csvline_populate is used to parse input with a delimiter. // required library: Eigen. If you want to compile the source code, please install Eigen. The compilation command using g++ is as follows: g++ -L ~/library_path/Eigen/ -Wall HaploPS_version2.cpp Any question or command, please email: xuanyao.liu@nus.edu.sg */ //function 1, parsing###################################################################################3 //void _csvline_populate(vector &record, const string& line, char delimiter); //////function parsing delimited files////////////////// //////function parsing delimited files////////////////// void _csvline_populate(vector &record, const string& line, char delimiter) { int linepos=0; int inquotes=false; char c; int linemax=line.length(); string curstring; record.clear(); while(line[linepos]!=0 && linepos < linemax) { c = line[linepos]; if (!inquotes && curstring.length()==0 && c=='"') { //beginquotechar inquotes=true; } else if (inquotes && c=='"') { //quotechar if ( (linepos+1 bool operator()(const Pair& a, const Pair& b) { return a.second < b.second; } }; template typename std::map::value_type, int>::value_type most_frequent_element(Fwd begin, Fwd end) { std::map::value_type, int> count; for (Fwd it = begin; it != end; ++it) ++count[*it]; return *std::max_element(count.begin(), count.end(), by_second()); } ////////////////////////////end of maxcount//////////////////////////////////////// ////////////////////////main program/////////////////////////////////////////////////// int main(int argc,char *argv[]){ clock_t t1,t2; t1=clock(); string inputfile;//input genotype file string legendfile; float freq; string outputfile; assert(argc ==9); inputfile=argv[2]; //genotype file legendfile=argv[4]; //legend file freq=atof(argv[6]); //search haplotype frequency outputfile=argv[8]; ofstream outf; outf.open(outputfile.c_str()); outf<<"chr"<<"\t"<<"startpos"<<"\t"<<"endpos"<<"\t"<<"genetic_dist"<<"\t"<<"physical_dist"<<"\t"<<"n.snp"< row; string line; string header;//header of table int i; int j; //int m; //////read in haplotype file/////////////// ifstream in(inputfile.c_str()); if (in.fail()) { cout << "Input genotype file not found "<crithap){ //make extras into string & find the highest occurrence and the rownumber & canonical haplotype string haplo; string canonicalhap; int canonicalhaprow; vector hapblock; hapblock.clear(); int haplo_start=max(posflag-extra,0);//rows in c++ int haplo_end=min(posflag+extra,snptotal-1); int ncolmatrix=haplo_end-haplo_start+1; MatrixXi hapblock_matrix(numchr,ncolmatrix); ////read every line of the haplotype file, & number of rows/////////// rowcount=0; while(rowcount x = most_frequent_element(hapblock.begin(), hapblock.end()); canonicalhap=x.first; canonicalhaprow=std::find(hapblock.begin(),hapblock.end(),canonicalhap)-hapblock.begin(); //calculate distances, and get frequency simcount=0; for(i=0;ithres) {simcount++;} } extra=extra+10; }//end of left right extra //iterates /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //extend haplotype to the right at 10 snps extra=extra-20; int positiveflag=1; while(positiveflag>0){ string haplo; string canonicalhap; int canonicalhaprow; vector hapblock; hapblock.clear(); int haplo_start=max(posflag-extra,0);//rows in c++ int haplo_end=min(posflag+extra+10*positiveflag,snptotal-1); if(haplo_end-haplo_start+1>1){ int ncolmatrix=haplo_end-haplo_start+1; MatrixXi hapblock_matrix(numchr,ncolmatrix); rowcount=0; while(rowcount x = most_frequent_element(hapblock.begin(), hapblock.end()); canonicalhap=x.first; canonicalhaprow=std::find(hapblock.begin(),hapblock.end(),canonicalhap)-hapblock.begin(); //calculate distances, and get frequency simcount=0; for(i=0;ithres) {simcount++;} } if(simcount>crithap) positiveflag++; if(simcount<=crithap || haplo_end==snptotal-1) { extraright=extra+10*(positiveflag-1); positiveflag=-1; } }//end of if nsnp>! if(haplo_end-haplo_start+1==1){ positiveflag=-1; extraright=extra; } }//while positive flag ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //extra to the left at 10 snps int negativeflag=1; while(negativeflag>0){ string haplo; string canonicalhap; int canonicalhaprow; vector hapblock; hapblock.clear(); int haplo_start=max(posflag-extra-10*negativeflag,0);//rows in c++ int haplo_end=min(posflag+extraright,snptotal-1); if(haplo_end-haplo_start+1>1){ int ncolmatrix=haplo_end-haplo_start+1; MatrixXi hapblock_matrix(numchr,ncolmatrix); ////read every line of the genotype file, & number of rows/////////// rowcount=0; while(rowcount x = most_frequent_element(hapblock.begin(), hapblock.end()); canonicalhap=x.first; canonicalhaprow=std::find(hapblock.begin(),hapblock.end(),canonicalhap)-hapblock.begin(); //calculate distances, and get frequency simcount=0; for(i=0;ithres) {simcount++;} } if(simcount>crithap) negativeflag++; if(simcount<=crithap || haplo_start==0) { extraleft=extra+10*(negativeflag-1); negativeflag=-1; } }//end of nsnp if(haplo_end-haplo_start+1==1){ negativeflag=-1; extraleft=extra; } }//while negative flag ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// //extra to the right at 1 snp positiveflag=1; while(positiveflag>0){ string haplo; string canonicalhap; int canonicalhaprow; vector hapblock; hapblock.clear(); int haplo_start=max(posflag-extraleft,0);//rows in c++ int haplo_end=min(posflag+extraright+positiveflag,snptotal-1); if(haplo_end-haplo_start+1>1){ int ncolmatrix=haplo_end-haplo_start+1; MatrixXi hapblock_matrix(numchr,ncolmatrix); ////read every line of the genotype file, & number of rows/////////// rowcount=0; while(rowcount x = most_frequent_element(hapblock.begin(), hapblock.end()); canonicalhap=x.first; canonicalhaprow=std::find(hapblock.begin(),hapblock.end(),canonicalhap)-hapblock.begin(); //cout<thres) {simcount++;} } if(simcount>crithap) positiveflag++; if(simcount<=crithap || haplo_end==snptotal-1) { extraright=extraright+(positiveflag-1); positiveflag=-1; } }//end of if nsnp>! if(haplo_end-haplo_start+1==1){ positiveflag=-1; extraright=extraright; } }//while positive flag //////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////// //extra to the left at 1 snp negativeflag=1; while(negativeflag>0){ string haplo; string canonicalhap; int canonicalhaprow; vector hapblock; hapblock.clear(); int haplo_start=max(posflag-extraleft-negativeflag,0);//rows in c++ int haplo_end=min(posflag+extraright,snptotal-1); if(haplo_end-haplo_start+1>1){ int ncolmatrix=haplo_end-haplo_start+1; MatrixXi hapblock_matrix(numchr,ncolmatrix); //cout< x = most_frequent_element(hapblock.begin(), hapblock.end()); canonicalhap=x.first; canonicalhaprow=std::find(hapblock.begin(),hapblock.end(),canonicalhap)-hapblock.begin(); //cout<thres) {simcount++;} } if(simcount>crithap) negativeflag++; if(simcount<=crithap || haplo_start==0) { extraleft=extraleft+(negativeflag-1); negativeflag=-1; } }//end of nsnp if(haplo_end-haplo_start+1==1){ negativeflag=-1; extraleft=extraleft; } }//while negative flag //////////////////////////////////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////////////////////////////// int hapfinal_start=max(0,posflag-extraleft); int hapfinal_end=min(max(min(snptotal, posflag + extraright), posflag + 1), snptotal); rowcount=0; int startpos=0; int endpos=0; float genetic_start=0; float genetic_end=0; ///read in legend fiile ifstream in3(legendfile.c_str()); if (in3.fail()) { cout << "Input legend file not found: "<