# ThinkStats in R by @hrbrmstr
# Example 2-1
# File format info: http://www.cdc.gov/nchs/nsfg/nsfg_cycle6.htm

# setup a data frame that has the field start/end info

pFields <- data.frame(name  = c('caseid', 'nbrnaliv', 'babysex', 'birthwgt_lb','birthwgt_oz','prglength', 'outcome', 'birthord',  'agepreg',  'finalwgt'), 
                      begin = c(1, 22, 56, 57, 59, 275, 277, 278, 284, 423), 
                      end   = c(12, 22, 56, 58, 60, 276, 277, 279, 287, 440) 
) 

# calcuate widthds so we can pass them to read.fwf()

pFields$width <- pFields$end - pFields$begin + 1 

# we aren't reading every field (for the book exercises)

pFields$skip <-  (-c(pFields$begin[-1]-pFields$end[-nrow(pFields)]-1,0)) 

widths <- c(t(pFields[,4:5])) 
widths <- widths[widths!=0] 

# read in the file

pregnancies <- read.fwf("2002FemPreg.dat", widths) 

# assign column names

names(pregnancies) <- pFields$name 

# divide mother's age by 100

pregnancies$agepreg <- pregnancies$agepreg / 100

# convert weight at birth from lbs/oz to total ounces

pregnancies$totalwgt_oz = pregnancies$birthwgt_lb * 16 + pregnancies$birthwgt_oz

rFields <- data.frame(name  = c('caseid'), 
                      begin = c(1), 
                      end   = c(12) 
) 

rFields$width <- rFields$end - rFields$begin + 1 
rFields$skip <-  (-c(rFields$begin[-1]-rFields$end[-nrow(rFields)]-1,0)) 

widths <- c(t(rFields[,4:5])) 
widths <- widths[widths!=0] 

respondents <- read.fwf("2002FemResp.dat", widths) 
names(respondents) <- rFields$name

# exercise 1-1
# not exactly the same, but even more info is provided in the summary from str()

str(respondents)
str(pregnancies)

# for exercise 1-2
# use subset() on the data frames
# again, lazy use of str() for output

str(livebirths)

livebirths <- subset(pregnancies,outcome==1)

# exercise 1-3

firstbabies <- subset(pregnancies,birthord==1 & outcome==1)
notfirstbabies <- subset(pregnancies,birthord > 1 & outcome==1)

str(firstbabies)
str(notfirstbabies)

# exercise 1-4

mean(firstbabies$prglength)
mean(notfirstbabies$prglength)

# example 2-2

sd(firstbabies$prglength)
sd(notfirstbabies$prglength)

# example 2-3
# hack to fake "mode"

names(sort(-table(firstbabies$prglength))[1])

# figure 2-1

par(mfrow=c(1,2))
hist(firstbabies$prglength, cex.lab=0.8, cex.axis=0.6, cex.main=0.8, las=1, col="white", ylim=c(0,3000),xlim=c(17,max(firstbabies$prglength)), breaks="Scott", main="Histogram of first babies", xlab="Weeks")
hist(notfirstbabies$prglength, cex.lab=0.8, cex.axis=0.6, cex.main=0.8, las=1, col="blue", ylim=c(0,3000),xlim=c(17,max(notfirstbabies$prglength)), breaks="Scott", main="Histogram of other babies", xlab="Weeks")
par(mfrow=c(1,1))