C       Last change:  LB   13 Mar 2002   12:01 pm
	program RRforest
c	copyright 1999 by leo Breiman
c	this is free software and can be used for any purpose. 
c	It comes with no guarantee.  
	

	parameter(mdim=12,nsample=506,ntest=1,nthsize=5,
     1	nrnodes=2*(nsample/nthsize)+1,jprint=1,jbt=500,mtry=mdim/3,
     1	imp=0,nimp=imax0(imp*nsample,1),mimp=imax0(imp*mdim,1),
     1  th=.5,impprint=0)
     
c	mdim=number of variables in data set

c	nsample=number of cases

c	number of samples in the test set.  ntest=1 implies no test set

c	nthsize=number of cases in a node below which the tree will not split,
c	setting nthsize=5 generally gives good results.

c	jprint=1 turns on output as trees are being added giving tree number,
c	test set mean-sum-of-squared prediction errors,
c	bagged mean-sum-of-squared prediction errors, and the bagged error 
c	divided by var(y).The bagged error estimates are known to be are 
c 	accurate estimates of the true error values.

c	jbt=number of trees in run.  200-500 gives pretty good results

c	mtry=number of variables to pick to split on at each node.  mdim/3
c		seems to give genrally good performance, but it can be 
c		altered uo or down

c	imp=1 turns on variable importance.  This is computed for the
c	mth variable as the percent rise in the test set mean sum-of-
c	squared errors when the mth variable is randomly permuted.

c	if impprint =1 then th=.5 says print out all variables with 
c	importance greater than .5 as the run is progressing.  
c	To keep from having a lot of output, set th high. 

c	at the end of the run all variable importances >0 will be printed out.

c	for each data set, the number of categoricals in each variable has to be
c	specified.  This is done by filling in the cat(mdim) vector See below.
C	cat(m)=1 says that variable m is numerical.  The program expects to see
c	categoricals specified by integers consecutively upward from 1.  


c	MAIN PROGRAM

	real x(mdim,nsample),y(nsample),yb(nsample),rsnodecost(nrnodes),
     1	rstreecost(0:nrnodes),bestcrit(nrnodes),sd(mdim),wts(nsample),
     1	v(nsample),ut(nsample),xt(nsample),xb(mdim,nsample),
     1	errimp(mimp), ytr(nsample),yptr(nsample),yl(nsample),xa(3*mdim),
     1	avnode(nrnodes),utr(nsample),predimp(nimp,mimp),za(mdim),
     1  tgini(mdim),upper(nrnodes),
     1	ypred(ntest),ytree(ntest),xts(mdim,ntest),yts(ntest)
	
	integer jdex(nsample),treemap(2,nrnodes), nodestatus(nrnodes),
     1	nodepop(nrnodes),npert(nsample),ip(mdim),nterm(0:nrnodes),
     1	nperm(nsample),parent(nrnodes),cat(mdim),nout(nsample),
     1	jin(nsample),isort(nsample),nodestart(nrnodes),ncase(nsample),
     1	nbrterm(nrnodes),jperm(jbt),mbest(nrnodes),incl(mdim)
	
c	initialize rn generator

	do n=1,717
	zz=rand(1)
	end do
	
c	declare number of values for each catgegorical

	do m=1,mdim
	cat(m)=1
	end do
	
	
c	BEGIN MAIN

	
c	read in data==example follows
	
c	open(7,file='data5',status='old')  
c	do k=1,500
c	read(7,*) (x(j,k),j=1,mdim),y(k)
c	end do
c	close(7)
c	if there is a test set
c	open(7,file='data5test',status='old')  
c	do k=1,300
c	read(7,*) (xts(j,k),j=1,mdim),yts(k)
c	end do
c	close(7)


 	open(7,file='boshouse',status='old')
	do k=1,nsample
	read(7,*) y(k),(x(j,k),j=1,mdim)
	end do
	close(7)
	
	
	qverrts=0
	averrb=0
	
		avy=0
		vary=0
		do n=1,nsample
		ntrue=n-1
		vary=vary+ntrue*(y(n)-avy)**2/(ntrue+1)
		avy=(ntrue*avy+y(n))/(ntrue+1)
		end do
		vary=vary/nsample
	
	
	call zervr(yptr,nsample)
	call zerv(nout,nsample)
	astr=0
	asd=0

	
	do jb=1,jbt
        call zerv(jin,nsample)
	do n=1,nsample
	k=int(rand1(1)*nsample)+1
	jin(k)=1
	yb(n)=y(k)
	do m=1,mdim
	xb(m,n)=x(m,k)
	end do
	end do
	
	
	nls=nsample
	
	call buildtree(xb,yb,yl,mdim,nls,nsample,treemap,jdex,
     1  upper,avnode,bestcrit, nodestatus,nodepop,nodestart,
     1  nrnodes,nthsize,rsnodecost,ncase,parent,ut,v,iprint,
     1  xt,mtry,ip,nlim,mbest,cat,tgini)

	ndbigtree=nrnodes
	do k=nrnodes,1,-1
	if (nodestatus(k).eq.0) ndbigtree=ndbigtree-1
	if (nodestatus(k).eq.2) nodestatus(k)=-1
	end do

	do n=1,50
	!write(*,*) n,avnode(n)
	end do
	!stop
	
	call zervr(ytr,nsample)
	
	call testreebag(x,nsample,mdim,treemap,nodestatus,
     1	nrnodes,ndbigtree,ytr,upper,avnode,mbest,cat)
     
     	
    	errb=0
	jout=0
	do n=1,nsample
	if(jin(n).eq.0) then
	yptr(n)=(nout(n)*yptr(n)+ytr(n))/(nout(n)+1)
	nout(n)=nout(n)+1
	end if
	
	
	if(nout(n).gt.0) jout=jout+1
	errb=errb+(y(n)-yptr(n))**2
	end do
	errb=errb/nsample
	
	call zervr(ytree,ntest)
	
	call testreebag(xts,ntest,mdim,treemap,nodestatus,
     1	nrnodes,ndbigtree,ytree,upper,avnode,mbest,cat)
    
	
	errts=0
	do n=1,ntest
	ypred(n)=((jb-1)*ypred(n)+ytree(n))/jb
	errts=errts+(yts(n)-ypred(n))**2
	end do
	errts=errts/ntest
	
	if(jprint.eq.1) then
	if(ntest.gt.1) then
	write(*,'(i5,3f11.3)')jb,errts,errb,errb/vary
	else
	write(*,'(i5,2f11.3)')jb,errb,errb/vary
	end if
	end if
	
	if(imp.eq.1) then
	do mr=1,mdim
	call permobmr(mr,x,utr,xt,jin,nsample,mdim)
	call testreebag(x,nsample,mdim,treemap,nodestatus,
     1	nrnodes,ndbigtree,ytr,upper,avnode,mbest,cat)
	do n=1,nsample
	x(mr,n)=xt(n)
	end do
	em=0
	do n=1,nsample
	if(jin(n).eq.0) then
	predimp(n,mr)=(nout(n)*predimp(n,mr)+ytr(n))
     1  /(nout(n)+1)
	end if
	em=em+(y(n)-predimp(n,mr))**2
	end do
	errimp(mr)=em/nsample
	if(impprint.eq.1) then 
	if(100*(errimp(mr)-errb)/errb.gt.th) 
     1	write(*,*) jb,mr,100*(errimp(mr)-errb)/errb
     	end if
     	end do 
	end if
	
	end do
c	end of tree iterations=======================================

	if(ntest.gt.1) then
	write(*,'(i5,3f11.3)')jbt,errts,errb,errb/vary
	else
	write(*,'(i5,2f11.3)')jbt,errb,errb/vary
	end if
	
	
	if(imp.eq.1) then
	write(*,*) "importances"
	do m=1,mdim
	errimp(m)=100*((errimp(m)/errb)-1)
	if(errimp(m).le.0) errimp(m)=0
	write(*,*) m,errimp(m)
	end do
	end if
	
	do m=1,mdim
c	write(*,*) m,tgini(m)
	end do
	
	
	end
	
	
c	END MAIN
	

c	SUBROUTINE BUILDTREE
	
	subroutine buildtree(x,y,yl,mdim,nls,nsample,treemap,
     1  jdex,upper,avnode,bestcrit, nodestatus,
     1  nodepop,nodestart,nrnodes,nthsize,rsnodecost,
     1  ncase,parent,ut,v,iprint,xt,mtry,ip,nlim,
     1  mbest,cat,tgini)
	
	integer treemap(2,nrnodes),parent(nrnodes),
     1  nodestatus(nrnodes),ip(mdim),nodepop(nrnodes),
     1  nodestart(nrnodes),jdex(nsample),ncase(nsample),
     1  mbest(nrnodes),cat(mdim)
	
	real  	y(nsample),bestcrit(nrnodes),x(mdim,nsample),
     1  avnode(nrnodes),xt(nsample),upper(nrnodes),
     1  v(nsample),ut(nsample),rsnodecost(nrnodes),
     1  yl(nsample),tgini(mdim)
	
	
	call zerv(nodestatus,nrnodes)
	call zerv(nodestart,nrnodes)
	call zerv(nodepop,nrnodes)
	call zervr(avnode,nrnodes)
	
	
	do n=1,nsample
	ut(n)=0
	jdex(n)=n
	end do

	ncur=1
	nodestart(1)=1
	nodepop(1)=nls
	nodestatus(1)=2
	
	av=0
	ss=0
	do n=1,nls
	d=y(jdex(n))
	ss=ss+(n-1)*(av-d)*(av-d)/n
	av=((n-1)*av+d)/n
	end do !n
	avnode(1)=av
	rsnodecost(1)=ss/nls
	
c	start main loop

	do 30 kbuild=1,nrnodes

	if (kbuild.gt.ncur) goto 50
	if (nodestatus(kbuild).ne.2) goto 30
	
c		initialize for next call to findbestsplit

	ndstart=nodestart(kbuild)
	ndend=ndstart+nodepop(kbuild)-1
	nodecnt=nodepop(kbuild)
	sumnode=nodecnt*avnode(kbuild)
	jstat=0
	
	
	call findbestsplit(x,xt,ut,jdex,y,mdim,nsample,
     1  ndstart,ndend,msplit,decsplit,ubest,ncase,ndendl,
     1  jstat,v,mtry,ip,nlim,sumnode,nodecnt,yl,cat)
	
	
	iprint=0
	if(iprint.eq.1) then
	write(*,*) kbuild,decsplit,ubest
	write(*,*) ndstart,ndendl,ndend
	write(*,*) ""
	end if
	
		
	if (jstat.eq.1) then
	nodestatus(kbuild)=-1
	go to 30
	else
		mbest(kbuild)=msplit
		upper(kbuild)=ubest
		bestcrit(kbuild)=decsplit
	end if
	
	tgini(msplit)=tgini(msplit)+decsplit
	
	
c	leftnode no.= ncur+1, rightnode no. = ncur+2.

	nodepop(ncur+1)=ndendl-ndstart+1
	nodepop(ncur+2)=ndend-ndendl
	nodestart(ncur+1)=ndstart
	nodestart(ncur+2)=ndendl+1
	
	av=0
	ss=0
	do n=ndstart,ndendl
	d=y(jdex(n))
	k=n-ndstart
	ss=ss+k*(av-d)*(av-d)/(k+1)
	av=(k*av+d)/(k+1)
	end do !n
	avnode(ncur+1)=av
	rsnodecost(ncur+1)=ss/nls
	
	av=0
	ss=0
	do n=ndendl+1,ndend
	d=y(jdex(n))
	k=n-ndendl-1
	ss=ss+k*(av-d)*(av-d)/(k+1)
	av=(k*av+d)/(k+1)
	end do !n
	avnode(ncur+2)=av
	rsnodecost(ncur+2)=ss/nls


c	check on nodestatus

	nodestatus(ncur+1)=2
	nodestatus(ncur+2)=2
	if (nodepop(ncur+1).le.nthsize)
     1  nodestatus(ncur+1)=-1
	if (nodepop(ncur+2).le.nthsize)
     1  nodestatus(ncur+2)=-1

	treemap(1,kbuild)=ncur+1
	treemap(2,kbuild)=ncur+2
	parent(ncur+1)=kbuild
	parent(ncur+2)=kbuild
	nodestatus(kbuild)=1
	ncur=ncur+2
	
	if (ncur.ge.nrnodes) goto 50
	
30	continue
50	continue

	end


c	SUBROUTINE FINDBESTSPLIT

	subroutine findbestsplit(x,xt,ut,jdex,y,mdim,
     1  nsample,ndstart,ndend,msplit,decsplit,ubest,
     1  ncase,ndendl,jstat,v,mtry,ip,nlim,
     1	sumnode,nodecnt,yl,cat)
	
	integer ncase(nsample),jdex(nsample),ip(mdim),
     1  ncat(32),icat(32),cat(mdim)
		
	real x(mdim,nsample),ut(nsample),xt(nsample),
     1  v(nsample),y(nsample),yl(nsample),
     1  sumcat(32),avcat(32),tavcat(32)
	
	
c		START BIG LOOP
		
		critmax=0
200		call zerv(ip,mdim)
		non=0
		do mt=1,mtry
		critvar=0
100		kv=int(rand(1)*mdim)+1
		if(ip(kv).eq.1) goto 100
		ip(kv)=100
		lc=cat(kv)
		if(lc.eq.1) then
		do n=ndstart,ndend
		xt(n)=x(kv,jdex(n))
		yl(n)=y(jdex(n))
		end do
		
		else
		
		call zervr(sumcat,32)
		call zerv(ncat,32)
		do n=ndstart,ndend
		l=nint(x(kv,jdex(n)))
		d=y(jdex(n))
		sumcat(l)=sumcat(l)+d
		ncat(l)=ncat(l)+1
		end do
		do  j=1,lc
		if(ncat(j).gt.0) then
		avcat(j)=sumcat(j)/ncat(j)
		else
		avcat(j)=0
		end if
		end do
		do n=1,nsample
		xt(n)=avcat(nint(x(kv,jdex(n))))
		yl(n)=y(jdex(n))
		end do
		end if
		
		
		do n=ndstart,ndend
		v(n)=xt(n)
		end do
		do n=1,nsample
		ncase(n)=n
		end do
		call quicksort (v,ncase,ndstart,ndend,nsample)
		if(v(ndstart).ge.v(ndend))then
		non=non+1
		if(non.ge.3*mdim) then
		jstat=1
		return
		end if
		goto 100
		end if
		
		 
c		ncase(n)=case number of v nth from bottom

		suml=0
		sumr=sumnode
		npopl=0
		npopr=nodecnt
		
		
		do nsp=ndstart,ndend-1
		d=yl(ncase(nsp))
		suml=suml+d
		sumr=sumr-d
		npopl=npopl+1
		npopr=npopr-1
		if (v(nsp).lt.v(nsp+1)) then
		crit=(suml*suml/npopl)+(sumr*sumr/npopr)
		if (crit.gt.critvar) then
			ubestt=(v(nsp)+v(nsp+1))/2.0
			critvar=crit
			nbestt=nsp
			endif
		end if
		end do

                if(critvar.gt.critmax) then
                ubest=ubestt
                nbest=nbestt
                msplit=kv
		critmax=critvar
		do n=ndstart,ndend
		ut(n)=xt(n)
		end do
		if (cat(kv).gt.1) then
                ic=cat(kv)
                do j=1,ic
                tavcat(j)=avcat(j)
                end do
                end if
                end if
                end do

                nl=ndstart-1
		do nsp=ndstart,ndend
		if(ut(nsp).le.ubest) then
		nl=nl+1
		ncase(nl)=jdex(nsp)
		end if
		end do
		
                ndendl=max0(nl,ndstart+1)
		nr=ndendl
		do nsp=ndstart,ndend
		if(ut(nsp).gt.ubest) then
		nr=nr+1
		if(nr.gt.nsample) goto 765
		ncase(nr)=jdex(nsp)
		end if
		end do
765		continue

                if(ndendl.ge.ndend) ndendl=ndend-1


                do n=ndstart,ndend
		jdex(n)=ncase(n)
		end do
                lc=cat(msplit)
		if(lc.gt.1) then
		do j=1,lc
		if(tavcat(j).lt.ubest) then
		icat(j)=1
		else
		icat(j)=0
		end if
		end do
                call packlb(lc,icat,nubest)
		ubest=real(nubest)
		end if
		
		decsplit=critmax-(sumnode*sumnode/nodecnt)

 		end
		
	
	subroutine testreebag(x,nsample,mdim,treemap,nodestatus,
     1	nrnodes,ndbigtree,ytree,upper,avnode,mbest,cat)
	
	
	real x(mdim,nsample),
     1	upper(nrnodes),avnode(nrnodes),ytree(nsample)
	
	integer treemap(2,nrnodes),nodestatus(nrnodes),
     1	mbest(nrnodes),cat(mdim),icat(32)
     	
	
	do n=1,nsample
	kt=1
	do k=1,ndbigtree
	if(nodestatus(kt).eq.-1) then
	ytree(n)=avnode(kt)
	goto 100
	end if
	m=mbest(kt)
	lc=cat(m)
	if(lc.eq.1) then
		if (x(m,n).le.upper(kt)) then 
			kt=treemap(1,kt)
		else
			kt=treemap(2,kt)
		endif 
	else
	mm=nint(upper(kt))
	call unpacklb(lc,mm,icat)
	j=nint(x(m,n))
		if(icat(j).eq.1) then 
			kt=treemap(1,kt)
		else
			kt=treemap(2,kt)
		endif
	end if
	end do
100	continue
	end do

	end
	
	subroutine permobmr(mr,x,tp,tx,jin,nsample,mdim)
	dimension x(mdim,nsample), tp(nsample),tx(nsample),
     1  jin(nsample)
	kout=0
	call zervr(tp,nsample)
	do n=1,nsample
	if(jin(n).eq.0) then
	kout=kout+1
	tp(kout)=x(mr,n)
	end if
	end do !n
	call perm1(kout,nsample,tp)
	iout=0
	do n=1,nsample
	tx(n)=x(mr,n)
	if(jin(n).eq.0) then
	iout=iout+1
	x(mr,n)=tp(iout)
	end if
	end do
	end
	
	
c	MISCELLANOUS SMALL SUBROUTINES	

	
	subroutine zerv(ix,m1)
	integer ix(m1)
	do 10 n=1,m1
	ix(n)=0
10	continue
	end
	
	subroutine zervr(rx,m1)
	real rx(m1)
	do 10 n=1,m1
	rx(n)=0
10	continue
	end
	
	subroutine zerm(mx,m1,m2)
	integer mx(m1,m2)
	do 10 i=1,m1
	do 20 j=1,m2
	mx(i,j)=0
20	continue
10	continue
	end
	
	subroutine packlb(l,icat,npack)
	integer icat(32)
	
c	icat is a binary integer with ones for categories going left
c	and zeroes for those going right.  The sub returns npack- the integer 
c	corresponding whose binary expansion is icat.
	
	npack=0
	do 10,k=1,l
	npack=npack+icat(k)*(2**(k-1))
10	continue
	end
	
	subroutine unpacklb(l,npack,icat)
	
c	npack is a long integer.  The sub. returns icat, an integer of zeroes and
c	ones corresponding to the coefficients in the binary expansion of npack.
	
	integer icat(32)
	call zerv(icat,32)
	n=npack
	icat(1)=mod(n,2)
	do 10 k=2,l
	n=(n-icat(k-1))/2
	icat(k)=mod(n,2)
10	continue
	end
	
	
	subroutine quicksort (v,iperm,ii,jj,kk)
c
c     puts into iperm the permutation vector which sorts v into
c     increasing order.  only elementest from ii to jj are considered.
c     array iu(k) and array il(k) permit sorting up to 2**(k+1)-1 elements
c
c     this is a modification of acm algorithm #347 by r. c. singleton,
c     which is a modified hoare quicksort.
c
      dimension iperm(kk),v(kk),iu(32),il(32)
      integer t,tt
     
      m=1
      i=ii
      j=jj
 10   if (i.ge.j) go to 80
 20   k=i
      ij=(j+i)/2
      t=iperm(ij)
      vt=v(ij)
      if (v(i).le.vt) go to 30
      iperm(ij)=iperm(i)
      iperm(i)=t
      t=iperm(ij)
      v(ij)=v(i)
      v(i)=vt
      vt=v(ij)
 30   l=j
      if (v(j).ge.vt) go to 50
      iperm(ij)=iperm(j)
      iperm(j)=t
      t=iperm(ij)
      v(ij)=v(j)
      v(j)=vt
      vt=v(ij)
      if (v(i).le.vt) go to 50
      iperm(ij)=iperm(i)
      iperm(i)=t
      t=iperm(ij)
      v(ij)=v(i)
      v(i)=vt
      vt=v(ij)
      go to 50
 40   iperm(l)=iperm(k)
      iperm(k)=tt
      v(l)=v(k)
      v(k)=vtt
 50   l=l-1
      if (v(l).gt.vt) go to 50
      tt=iperm(l)
      vtt=v(l)
 60   k=k+1
      if (v(k).lt.vt) go to 60
      if (k.le.l) go to 40
      if (l-i.le.j-k) go to 70
      il(m)=i
      iu(m)=l
      i=k
      m=m+1
      go to 90
 70   il(m)=k
      iu(m)=j
      j=l
      m=m+1
      go to 90
 80   m=m-1
      if (m.eq.0) return
      i=il(m)
      j=iu(m)
 90   if (j-i.gt.10) go to 20
      if (i.eq.ii) go to 10
      i=i-1
 100  i=i+1
      if (i.eq.j) go to 80
      t=iperm(i+1)
      vt=v(i+1)
      if (v(i).le.vt) go to 100
      k=i
 110  iperm(k+1)=iperm(k)
      v(k+1)=v(k)
      k=k-1
      if (vt.lt.v(k)) go to 110
      iperm(k+1)=t
      v(k+1)=vt
      go to 100
      end
      
      	subroutine perm1(np,ns,tp)
	real tp(ns)
	j=np
11	rnd = rand(1)
	k=int(j*rnd)+1
	tx=tp(j)
	tp(j)=tp(k)
	tp(k)=tx
	j=j-1
	if(j.gt.1) go to 11
	end

	
	subroutine perm(ns,ntp)
	integer ntp(ns)
	do 1 n= 1,ns
	ntp(n)=n
1	continue        
	j=ns
11	rnd = rand1(1)
	k=int(j*rnd)+1
	jx=ntp(j)
	ntp(j)=ntp(k)
	ntp(k)=jx
	j=j-1
	if(j.gt.1) go to 11
	end
	
	
	real function rand(j)
	double precision dseed
	real u
	save dseed
	data dseed /17395/
	call lrnd(dseed,u)
	rand=u
	end

	subroutine lrnd(dseed,u)
	real u
	double precision dseed, d31m1
	data d31m1 /2147483647/
	dseed=dmod(16087*dseed,d31m1)
	u=dseed/d31m1
	return
	end
	
	real function rand1(j)
	double precision dseed1
	real u
	save dseed1
	data dseed1 /17395/
	call lrnd1(dseed1,u)
	rand1=u
	end

	subroutine lrnd1(dseed1,u)
	real u
	double precision dseed1, d31m1
	data d31m1 /2147483647/
	dseed1=dmod(16087*dseed1,d31m1)
	u=dseed1/d31m1
	return
	end
	
	
	function rnorm(j)
	double precision u,v
	u=dble(rand(1))
	v=dble(rand(1))
	rnorm=real(dsqrt(-2*dlog(u))*dcos(6.28318531*v))
	end