set more off
clear all
set scheme s1mono

 * Table 10.1
set seed 9896
use wcgs, clear
quietly logistic chd69 age chol sbp bmi smoke, coef
predict fitted, pr
roctab chd69 fitted
 
* Step 1: divide data into 10 mutually exclusive subsets
xtile group = uniform(), nq(10)
quietly gen cvfitted = .
forvalues i = 1/10 {

	* Step 2: estimate model omitting each subset
	quietly xi: logistic chd69 age chol sbp bmi smoke if group~=`i'
	quietly predict cvfittedi, pr
	
	* Step 3: save cross-validated statistic for each omitted subset
	quietly replace cvfitted = cvfittedi if group==`i'
	quietly drop cvfittedi
	}
	
* Step 4: calculate cross-validated area under ROC curve
roctab chd69 cvfitted

* Section 10.1.6

/* Dr. Mary Whooley, the PI of the Heart and Soul Study, generously let us use 
her data for the example, but could not grant us permission to post the dataset. */

* screening candidate models in terms of cross-validated C-index
capture program drop cvci
program define cvci, rclass
syntax varlist(fv) [if] [in] [, h(integer 10) reps(integer 1) ///
	seed(integer 9896) saveto(string) ]
tempname order group eta etaj c sumc 
set seed `seed'
qui gen `eta' = .
qui gen `c' = .
qui gen `sumc' = 0
qui gen `group' = .
forvalues i = 1/`reps' {
  capture drop `group'
  xtile `group' = uniform(), nq(`h')
  forvalues j = 1/`h' {
    qui stcox `varlist' if `group'~=`j'
    qui predict `etaj', xb
    qui replace `eta' = `etaj' if `group' == `j'
    drop `etaj'
    }
  qui stcox `eta'
  qui estat concordance
  qui replace `sumc' = `sumc' + r(C)
  }
* average across repetitions
qui replace `c' = `sumc'/`reps'
preserve
qui keep if _n==1 
gen CV_Cindex = `c'
gen model = "`varlist'"
keep CV_Cindex model
list, noobs noheader clean
qui append using "`saveto'"
qui save "`saveto'", replace
restore
end

cap program drop run_cvci
program define run_cvci
syntax varlist(fv) [, ds(namelist) rescreen_reps(integer 10) ntop(integer 10) ///
	basemodel(string asis) outdata(string asis) outdata2(string asis) ]

* do initial screening
* initialize fast-screening results file
clear
gen model = ""
gen CV_Cindex = .
save "`outdata'", replace

use "`ds'", clear
cvci `basemodel', saveto("`outdata'")
foreach v1 in `varlist' {
  local model = "`basemodel' `v1'"
  cvci `model', saveto("`outdata'")
  foreach v2 in `varlist' {
    if "`v1'"<"`v2'" {
      local model = "`basemodel' `v1' `v2'"
      cvci `model', saveto("`outdata'")
      foreach v3 in `varlist' {
        if "`v2'"<"`v3'" {
          local model = "`basemodel' `v1' `v2' `v3'"
          cvci `model', saveto("`outdata'")  
          foreach v4 in `varlist' {
            if "`v3'"<"`v4'" {
              local model = "`basemodel' `v1' `v2' `v3' `v4'"
              cvci `model', saveto("`outdata'")
              foreach v5 in `varlist' {
                if "`v4'"<"`v5'" {
                  local model = "`basemodel' `v1' `v2' `v3' `v4' `v5'"
                  cvci `model', saveto("`outdata'")
                  foreach v6 in `varlist' {
                    if "`v5'"<"`v6'" {
                      local model = "`basemodel' `v1' `v2' `v3' `v4' `v5' `v6'"
                      cvci `model', saveto("`outdata'")
                      foreach v7 in `varlist' {
                        if "`v6'"<"`v7'" {
                          local model = ///
                          	"`basemodel' `v1' `v2' `v3' `v4' `v5' `v6' `v7'"
                          cvci `model', saveto("`outdata'")
                          }
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }

use "`outdata'", clear
gsort -CV_Cindex
gen init_rank = _n
save "`outdata'", replace

* generate do file to run top models with more repetitions
keep if init_rank <= `ntop'
gen str command = "cvci "+model+", reps(`rescreen_reps') saveto(tempout)"
outfile command using "topmodels.do", noquote wide replace
sort model
save firstscreen, replace

* initialize re-screening results file
clear
gen model = ""
gen CV_Cindex = .
save tempout, replace

* run the do-file
use "`ds'", clear
run topmodels

use tempout, clear
sort model
merge 1:1 model using firstscreen
gsort -CV_Cindex
format CV_Cindex %5.4f
list CV_Cindex model, noobs clean
save "`outdata2'", replace

erase tempout.dta
erase firstscreen.dta
end

* use "HSRS032312.dta", clear
* select observations with complete data
foreach x in ///
	age male bmi cursmkr dm hxchf hstnt crp ntprobnp lvef50 ckdmdrd uacr {
	qui drop if `x'==.
	}
mkspline ntprobnpsp3 = ntprobnp, cubic nk(3)
mkspline uacrsp3 = uacr, cubic nk(3)
stset combine8fu, f(combine8)
save temp, replace

run_cvci male bmi cursmkr dm crp3 /*ckdmdrd hstnt hxchf physact lowmeds*/, ///
	basemodel(age ntprobnpsp* uacrsp3* lvef50) ///
	ds(temp) outdata(screen) outdata2(rescreen) 

* Table 10.2
* utility ado-file to calcalate S0, CIF, or CSH at specified time
capt program drop getblt
program define getblt
args t baseline
preserve
qui keep if _t <= `t' & `baseline'~=.	
gsort -_t
qui keep if _n == 1
scalar `baseline'`t' = `baseline'
end

capture program drop continuous
program define continuous, rclass
syntax varlist(fv) [, tau(real 5) gofgroups(integer 10) units(namelist) ///
	h(integer 10) reps(integer 10) makeplot seed(integer 9896)]
tempname order group S0 eta sumeta etaj etagroup c sumc cs sumcs
set seed `seed'

* get baseline survival function from complete data
qui stcox `varlist', basesurv(`S0')
qui estat concordance
scalar naive_Cindex = round(r(C)*100, .1)

* cross-validate model-based risks
qui gen `eta' = .
qui gen `sumeta' = 0
qui gen `c' = .
qui gen `sumc' = 0
qui gen `cs' = .
qui gen `sumcs' = 0
qui gen `group' = .
forvalues i = 1/`reps' {
	capture drop `group'
	xtile `group' = uniform(), nq(`h')
	forvalues j = 1/`h' {
		qui stcox `varlist' if `group'~=`j'
		qui predict `etaj', xb
		qui replace `eta' = `etaj' if `group' == `j'
		qui replace `sumeta' = `sumeta' + `etaj' if `group' == `j'
		drop `etaj'
		}
	qui stcox `eta'
	qui estat concordance
	qui replace `sumc' = `sumc' + r(C)
	qui replace `sumcs' = `sumcs' + _b[`eta']
	}
* average across repetitions
qui replace `eta' = `sumeta'/`reps'
qui replace `c' = `sumc'/`reps'
qui replace `cs' = `sumcs'/`reps'
preserve
collapse `c' `cs'
scalar CV_C_index = round(`c'*100, .1)
scalar calibration_slope = round(`cs', .01)
restore

* calculate model-based risks using aggregate baseline survival 
* and averaged cross-validated linear predictor 
getblt `tau' `S0'
qui gen risk`tau' = 1 - `S0'`tau'^exp(`eta') 

* GOF test
xtile `etagroup' = `eta', nq(`gofgroups')
label variable `etagroup' "Model Based Risk"
qui stcox `eta'
estimates store baseline
qui stcox `eta' i.`etagroup'
qui lrtest baseline
scalar p_value = round(r(p), .1)

* histogram of model-based and K-M risks
if("`makeplot'"~="") {
	qui sts list, failure by(`etagroup') at(1/`tau') saving(kmout, replace)
	preserve
	use kmout, clear
	qui keep if time==`tau'
	sort `etagroup'
	qui gen kmrisk`tau' = failure*100
	keep `etagroup' kmrisk`tau'
	qui save kmout, replace
	restore
	sort `etagroup'
	qui merge m:1 `etagroup' using kmout
	erase kmout.dta
	qui replace risk`tau'= risk`tau'*100
	graph bar (mean) risk`tau' kmrisk`tau', over(`etagroup') ///
		legend(order(1 "Model-based" 2 "Kaplan-Meier") ring(0) pos(1)) ///
		ytitle("`tau'-`units' Risk (%)") yscale(range(0 100)) ylabel(0(20)100) ///
		text(100 0 "Continuous Model", placement(se) size(large)) ///
		name(continuous_barplot, replace)
	}
scalar list calibration_slope p_value naive_Cindex CV_C_index
end

capture program drop pointscore
program define pointscore, rclass
syntax varlist(fv) [, tau(real 5) units(namelist) ///
	maxscore(integer 9) h(integer 10) reps(integer 10) ///
	makeplot seed(integer 9896) ]
	
tempname order group S0 minb point_score sum_ps ps_j ps ps_group eta c sumc
set seed `seed'
qui stcox `varlist', nohr
preserve
qui keep if _n==1
gen `minb' = 10
foreach x in `varlist' {
	scalar b_`x' = _b[`x']
	qui replace `minb' = b_`x' if b_`x'< `minb'
	}
* dis ""
* dis "Points for each variable"
foreach x in `varlist' {
	scalar points_`x' = round(b_`x'/`minb')
	}
restore 
qui gen `point_score' = 0
foreach x in `varlist' {
	qui replace `point_score' = `point_score' + `x' * points_`x'
	}
	
* stratification variable based on aggregation of point scores
qui recode `point_score' `maxscore'/max=`maxscore', gen(`ps_group')
label variable `ps_group' "Point score"
local lastscore = `maxscore'-1
label define points `maxscore' ">`lastscore'"
label values `ps_group' points

* refit model using point scores
qui stcox `point_score', basesurv(`S0') 

* naive C-index
qui estat concordance
scalar naive_C_index = round(r(C)*100, .1)

* cross validate the point scores
qui gen `order' = .
qui gen `group' = .
qui gen `c' = .
qui gen `sumc' = 0
qui gen `ps' = .
qui gen `sum_ps' = 0
forvalues i = 1/`reps' {
	capture drop `group'
	qui replace `order' = uniform()
	qui xtile `group' = `order', nq(`h')
	forvalues j = 1/`h' {
		qui stcox `varlist' if `group'~=`j'
		preserve
		qui keep if _n==1
		gen `minb' = 10
		foreach x in `varlist' {
			scalar b_`x' = _b[`x']
			qui replace `minb' = b_`x' if b_`x'< `minb'
			}
		scalar k = 2.999/(`maxb'-`minb')
		foreach x in `varlist' {
			scalar points_`x' = round(b_`x'/`minb')
			}
		restore 
		qui gen `ps_j' = 0
		foreach x in `varlist' {
			qui replace `ps_j' = `ps_j' + `x' * points_`x'
			}	
		qui replace `ps' = `ps_j' if `group' == `j'
		qui drop `ps_j'
		}
	qui stcox `ps'
	qui estat concordance
	qui replace `sumc' = `sumc' + r(C)
	qui replace `sum_ps' = `sum_ps' + `ps'
	}
* average point scores across repetitions
qui replace `c' = `sumc'/`reps'
qui replace `ps' = `sum_ps'/`reps'

* re-fit model using CV point scores
qui stcox `ps'

* CV C index
preserve
collapse `c'
scalar CV_C_index = round(`c'*100, .1)
restore

* Model-based risk using CV point scores
qui predict `eta', xb
getblt `tau' `S0'
qui gen risk`tau' = 1 - `S0'`tau'^exp(`eta') 

* CV GOF test
estimates store baseline
qui stcox `ps' i.`ps_group'
qui lrtest baseline
scalar p_value = round(r(p), .01)

* histogram of predicted and K-M risks
if("`makeplot'"~="") {
	qui sts list, failure by(`ps_group') at(1/`tau') saving(kmout, replace)
	preserve
	use kmout, clear
	qui keep if time==`tau'
	sort `ps_group'
	qui gen kmrisk`tau' = failure*100
	keep `ps_group' kmrisk`tau'
	qui save kmout, replace
	restore
	sort `ps_group'
	qui merge m:1 `ps_group' using kmout
	erase kmout.dta
	replace risk`tau'= risk`tau'*100
	graph bar (mean) risk`tau' kmrisk`tau', over(`ps_group') ///
		legend(order(1 "Model-based" 2 "Kaplan-Meier") ring(0) pos(1)) ///
		ytitle("`tau'-`units' Risk (%)") yscale(range(0 100)) ylabel(0(20)100) ///
		text(100 0 "Point Score Model", placement(se) size(large)) ///
		name(pointscore_barplot, replace)
	}
scalar list p_value naive_C_index CV_C_index
end

set graphics off
use temp, clear
continuous age lvef50 ntprobnpsp31 ntprobnpsp32 uacrsp31 uacrsp32 cursmkr ///	
	, tau(5) units(Year) 
use temp, clear	
continuous age lvef50 ntprobnpsp31 ntprobnpsp32 uacrsp31 uacrsp32 cursmkr dm ///	
	, tau(5) units(Year) makeplot
use temp, clear	
continuous age lvef50 ntprobnpsp31 ntprobnpsp32 uacrsp31 uacrsp32 cursmkr tnt ///	
	, tau(5) units(Year) 
use temp, clear	
pointscore lvef50 bnp500 uacr30 age65up cursmkr ///	
	, tau(5) units(Year) maxscore(9)
use temp, clear	
pointscore lvef50 bnp500 uacr30 age65up cursmkr dm ///	
	, tau(5) units(Year) maxscore(9) ///
	makeplot
use temp, clear	
pointscore lvef50 bnp500 uacr30 age65up cursmkr dm hxchf ///	
	, tau(5) units(Year) maxscore(9)

erase temp.dta
set graphics on
qui graph combine continuous_barplot pointscore_barplot, rows(2)

	
