#required library: APcluster algorithm
library("apcluster")


#A function to calculate p-value of fiSher combination; a - a vector of p-values 
fisher_p<-function(a) 1-pchisq(-2*sum(log(a)),df=2*length(a))


#Similarity function: v1, v2 - 2 numeric or character vectors of variant identifiers

simf<-function(v1,v2){

   nom<-length(intersect(v1,v2))
   denom<-length(union(v1,v2))
   nom/denom

}


# AP cluster function:
# p - a vector of ethnic group p-values.
# snp_id_list is a list of SNP identifiers (e.g. SNP ids, or any other values which uniquely define variants withit a region).
# simf - similarity function which takes as an input two numeric or character vectors of variants identifiers and outputs a numeric similarity.


APcluster_meta<-function(p,snp_id_list,simf){

  #Check input
  
  if(!(is.numeric(p))) stop('p should be numeric vector')
  for(i in 1:length(snp_id_list)) if(!(is.numeric(snp_id_list[[i]]) || is.character(snp_id_list[[i]]))) stop('elements of snp_id_list should be either numeric or character vectors')
  if(length(p)!=length(snp_id_list)) stop('Length of p and snp_id_list should be the same')

  #Building similarity matrix s ()

  npop<-length(snp_id_list)
  s<-matrix(0,npop,npop)
  for(i in 1:npop) for(j in 1:npop) s[i,j]<-simf(snp_id_list[[i]],snp_id_list[[j]])

  #Clustering

  b<-apcluster(s)
  cl<-b@clusters

  #If algorithm has not converged, assume one cluster

  if(length(cl)==0) cl[[1]]<-1:length(pop)
  
  #P-value 
  p1<-c()
  for(i in 1:length(cl)) p1<-c(p1,fisher_p(p[cl[[i]]])) 
  fisher_p(p1)

 }






##############################################################################################################################
#EXAMPLE DATA ANALYSIS: THE WORKING DIRECTORY IS IN THE EXAMPLE DATA FOLDER
##############################################################################################################################

#required library for SKAT test
library("SKAT")

#A FUNCTION TO CALCULATE SKAT P-VALUE FOR A SINGLE ETHNIC GROUP, geno - matrix of genotype, obj - output of SKAT_Null_Model function
skatf<-function(geno,obj) SKAT(geno,obj)$p.value

#A MATRIX FOR MAF WITHIN EACH ETHNIC GROUP
maf_f<-c()

#pskat - VECTOR OF SKAT P-VALUES
pskat<-c()



#READING DATA, CALCULATING SKAT P-VALUES, AND CALCULATING MAF FOR EACH ETHNIC GROUP

pop<-c("E1","E2","E3","E4","A1","A2","A3","A4","Af1","Af2","Af3","Af4")

 for(p in pop){

  geno<-matrix(scan(paste(p,"_example.txt",sep=""),what=numeric()),byrow=TRUE,nrow=1000)

  #PHENOTYPE IS IN THE FIRST COLUMN
  phreal<-geno[,1]
  geno<-geno[,-1]

  #CLACULATE MAF FOR
  maf<-apply(geno,2,sum)/dim(geno)[1]/2
  maf_f<-rbind(maf_f,maf)

  #DELETE UNOBSERVED VARIANTS FOR AN ETHNIC GROUP BEFORE SKAT ANALYSIS
  maf<-apply(geno,2,sum)/2/dim(geno)[1]
  out<-(1:length(maf))[maf>0.01 | maf==0]
  if(length(out)>0) geno<-geno[,-out]

  #SKAT P-VALUE
  obj<-SKAT_Null_Model(phreal~1,data=data.frame(phreal=phreal),out_type="C")
  pskat<-c(pskat,skatf(geno,obj))

}



#CREATE snp_id_list FROM A MATRIX OF MAF
#NOTE: IN THIS CASE VARIANT IDENTIFIER IS JUST A NUMBER OF A VARIANT, FIRST BEING THE FIRST VARIANT IN GENOTYPE MATRICES.
#nsnps - NUMBER OF VARIANTS OBSERVED ACROSS ALL THE 12 ETHNIC GROUPS

snp_id_list<-list()
nsnps<-dim(maf_f)[2]
for(i in 1:length(pop)) snp_id_list[[i]]<-(1:nsnps)[maf_f[i,]>0]


#apcluster meta-analysis
APcluster_meta(pskat,snp_id_list,simf)


#OUTPUT: 0.2239268