An Interactive Introduction to R for Actuaries
CAS Conference November 2009
Michael E. Driscoll, Ph.D. Daniel Murphy FCAS, MAAA
An Interactive Introduction to R for Actuaries CAS Conference - - PowerPoint PPT Presentation
An Interactive Introduction to R for Actuaries CAS Conference November 2009 Michael E. Driscoll, Ph.D. Daniel Murphy FCAS, MAAA January 6, 2009 R is a tool for Data Manipulation connecting to data sources slicing & dicing data
Michael E. Driscoll, Ph.D. Daniel Murphy FCAS, MAAA
January 6, 2009
> 2+2 4
> x <- 2+2 ## '<-' is R syntax for '=' or assignment > x^2 16
> weight <- c(110, 180, 240) ## three weights > height <- c(5.5, 6.1, 6.2) ## three heights > bmi <- (weight*4.88)/height^2 ## divides element-wise 17.7 23.6 30.4
mean(weight) sd(weight) sqrt(var(weight)) 176.6 65.0 65.0 # same as sd
union intersect setdiff
> pbinom(40, 100, 0.5) ## P that a coin tossed 100 times 0.028 ## that comes up 40 heads is 'fair' > pshare <- pbirthday(23, 365, coincident=2) 0.530
## probability that among 23 people,
two share a birthday
> 2 + 2 [Hit ENTER] > log(100) [Hit ENTER]
> 100 * exp(0.05*10) [Hit ENTER]
> year <- (1,2,5,10,25) [Hit ENTER] this returns an error. why? > year <- c(1,2,5,10,25) [Hit ENTER] > 100 * exp(0.05*year) [Hit ENTER]
> heads <- rbinom(10^5,100,0.50) > hist(heads)
> pnorm(0) 0.5 > qnorm(0.9) 1.28 > rnorm(100) vector of length 100
distribution dist suffix in R Beta
Binomial
Cauchy
Chisquare
Exponential
F
Gamma
Geometric
Hypergeometric
Logistic
Lognormal
Negative Binomial
Normal
Poisson
Student t
Uniform
Tukey
Weibull
Wilcoxon
How to find the functions for lognormal distribution? 1) Use the double question mark ‘??’ to search > ??lognormal 2) Then identify the package > ?Lognormal 3) Discover the dist functions
dlnorm, plnorm, qlnorm, rlnorm
> numclaims <- rpois(n, lambda) (hint: use ?rpois to understand the parameters)
> mean(numclaims) > var(numclaims)
> hist(numclaims)
> Insurance <- read.csv("Insurance.csv",header=TRUE)
> con <- dbConnect(driver,user,password,host,dbname) > Insurance <- dbSendQuery(con, "SELECT * FROM claims")
> con <- url('http://labs.dataspora.com/test.txt') > Insurance <- read.csv(con, header=TRUE)
> load('Insurance.RData')
write.csv(Insurance,file="Insurance.csv")
con <- dbConnect(dbdriver,user,password,host,dbname) dbWriteTable(con, "Insurance", Insurance)
save(Insurance, file="Insurance.RData")
> ls()
> str(x) > head(x) > tail(x) > class(x)
> rm(x)
library(MASS) head(Insurance) ## the first 7 rows
dim(Insurance) ## number of rows & columns
write.csv(Insurance,file="Insurance.csv", row.names=FALSE) getwd() ## where am I?
remove the first district
read.csv(Insurance, file="Insurance.csv") plot(Claims/Holders ~ Age, data=Insurance)
df[city == "New York",]
subset()
reshape()
transform()
m <- lm(Claims ~ Age, data=Insurance)
summary(m)
plot(m)
m <- logit (Claims ~ Age, data=Insurance)
summary(m)
plot(m)
m <- lm(Claims/Holders ~ Age + 0, data=Insurance)
summary(m)
plot(m)
library(ggplot2) qplot(Group, Claims/Holders, data=Insurance, geom="bar", stat='identity', position="dodge", facets=District ~ ., fill=Age)
library(ggplot2) qplot(Age, Claims/Holders, data=Insurance, geom="boxplot") library(lattice) bwplot(Claims/Holders ~ Age, data=Insurance)
library(lattice) densityplot(~ Claims/Holders | Age, data=Insurance, layout=c(4,1)) library(ggplot2) qplot(Claims/Holders, data=Insurance, facets=Age ~ ., geom="density")
> x <- 1:10 > y <- x^2 > plot(y ~ x)
> library(lattice) > boxplot(Claims/Holders ~ Age, data=Insurance)
> abline()
help(func) > ?func
> help.search(topic) > ??topic
http://www.casact.org/newsletter/index.cfm?fa=viewart&id=5756
p p
p p
) ( , 1 2 ,
2 1
= + + = =
Visualization Learning R Statistical Modeling Data Manipulation
P&C Actuarial Models Design • Construction Collaboration • Education Valuable • Transparent Daniel Murphy, FCAS, MAAA dmurphy@trinostics.com 925.381.9869
32
Michael E. Driscoll, Ph.D. www.dataspora.com San Francisco, CA 415.860.4347
x <- c(0,2:4) y <- c("alpha", "b", "c3", "4") z <- c(1, 0, TRUE, FALSE)
> class(x) [1] "numeric" > x2 <- as.logical(x) > class(x2) [1] "logical"
lst <- list(x,y,z) M <- matrix(rep(x,3),ncol=3) df <- data.frame(x,y,z)
> class(df) [1] "data.frame"
?
qplot(log(carat), log(price), data = diamonds, alpha=I(1/20)) + facet_grid(. ~ color)
(source: http://lmdvr.r-forge.r-project.org )
densityplot(~ speed | type, data=pitch)
xyplot(x ~ y, data=pitch)
xyplot(x ~ y, groups=type, data=pitch)
xyplot(x ~ y | type, data=pitch)
xyplot(x ~ y | type, data=pitch, fill.color = pitch$color, panel = function(x, y, fill.color, ..., subscripts) { fill <- fill.color[subscripts] panel.xyplot(x, y, fill = fill, ...) })
xyplot(x ~ y | type, data=pitch, fill.color = pitch$color, panel = function(x, y, fill.color, ..., subscripts) { fill <- fill.color[subscripts] panel.xyplot(x, y, fill = fill, ...) })
library("colorspace") red <- LAB(50,64,64) blue <- LAB(50,-48,-48) mixcolor(10, red, blue)
hexbinplot(log(price)~log(carat),data=diamonds,xbins=40)