# ThinkStats in R by @hrbrmstr # Example 2-1 # File format info: http://www.cdc.gov/nchs/nsfg/nsfg_cycle6.htm # setup a data frame that has the field start/end info pFields <- data.frame(name = c('caseid', 'nbrnaliv', 'babysex', 'birthwgt_lb','birthwgt_oz','prglength', 'outcome', 'birthord', 'agepreg', 'finalwgt'), begin = c(1, 22, 56, 57, 59, 275, 277, 278, 284, 423), end = c(12, 22, 56, 58, 60, 276, 277, 279, 287, 440) ) # calcuate widthds so we can pass them to read.fwf() pFields$width <- pFields$end - pFields$begin + 1 # we aren't reading every field (for the book exercises) pFields$skip <- (-c(pFields$begin[-1]-pFields$end[-nrow(pFields)]-1,0)) widths <- c(t(pFields[,4:5])) widths <- widths[widths!=0] # read in the file pregnancies <- read.fwf("2002FemPreg.dat", widths) # assign column names names(pregnancies) <- pFields$name # divide mother's age by 100 pregnancies$agepreg <- pregnancies$agepreg / 100 # convert weight at birth from lbs/oz to total ounces pregnancies$totalwgt_oz = pregnancies$birthwgt_lb * 16 + pregnancies$birthwgt_oz rFields <- data.frame(name = c('caseid'), begin = c(1), end = c(12) ) rFields$width <- rFields$end - rFields$begin + 1 rFields$skip <- (-c(rFields$begin[-1]-rFields$end[-nrow(rFields)]-1,0)) widths <- c(t(rFields[,4:5])) widths <- widths[widths!=0] respondents <- read.fwf("2002FemResp.dat", widths) names(respondents) <- rFields$name # exercise 1-1 # not exactly the same, but even more info is provided in the summary from str() str(respondents) str(pregnancies) # for exercise 1-2 # use subset() on the data frames # again, lazy use of str() for output str(livebirths) livebirths <- subset(pregnancies,outcome==1) # exercise 1-3 firstbabies <- subset(pregnancies,birthord==1 & outcome==1) notfirstbabies <- subset(pregnancies,birthord > 1 & outcome==1) str(firstbabies) str(notfirstbabies) # exercise 1-4 mean(firstbabies$prglength) mean(notfirstbabies$prglength) # example 2-2 sd(firstbabies$prglength) sd(notfirstbabies$prglength) # example 2-3 # hack to fake "mode" names(sort(-table(firstbabies$prglength))[1]) # figure 2-1 par(mfrow=c(1,2)) hist(firstbabies$prglength, cex.lab=0.8, cex.axis=0.6, cex.main=0.8, las=1, col="white", ylim=c(0,3000),xlim=c(17,max(firstbabies$prglength)), breaks="Scott", main="Histogram of first babies", xlab="Weeks") hist(notfirstbabies$prglength, cex.lab=0.8, cex.axis=0.6, cex.main=0.8, las=1, col="blue", ylim=c(0,3000),xlim=c(17,max(notfirstbabies$prglength)), breaks="Scott", main="Histogram of other babies", xlab="Weeks") par(mfrow=c(1,1))