# When viewing this post the code looks messy, but just copy and paste into

# notepad or the R scripting window and it should line up

# This program is a little more complicated than the previous ones,and some of

# the topics may be a little out of the scope of this class, but if you

# are interested and don't understand something let me know

# example plots from Part 2 of the program using sample salary data:

# histogram of salary data with gaussian KDE plot

# KDE plots of salary data with varying bandwidth selections using the 'adjust'

# option

# PROGRAM NAME: KDE_R

# DATE: 6/18/2010

# AUTHOR : MATT BOGARD

# PURPOSE: EXAMPLE OF KERNAL DENSITY ESTIMATES IN R

# DATA USED: SIMULATED WITHIN SCRIPT

# PACKAGES: lattice Hmisc

# OUTPUT FILE: "C:\Documents and Settings\wkuuser\My Documents\TOOLS AND REFERENCES\R References"

# UPDATES:

#

#

# COMMENTS: GIVE A BASIC UNDERSTANDING OF THE density() FUNCTION IN R

# AND BANDWIDTH, KERNAL SELECTION, AND PLOTTING

#

# REFERENCES: ADAPTED FROM

# http://ise.tamu.edu/metrology/Group%20Seminars/Kernel_Density_Estimation_2008JUN27_2.pdf

#

# CONTENTS: PART1 EXAMPLES USING CODE GENERATED NORMALLY DISTRIBUTED DATA

# PART2 EXAMPLES USING SAMPLE DATA

#---------------------------------------------------------------#

# PART1 EXAMPLES USING CODE GENERATED NORMALLY DISTRIBUTED DATA

#---------------------------------------------------------------#

x <- seq(-4, 4, 0.01) y <- dnorm(x, mean=0, sd=1 # create x,y data set that is exactly normal

rn <- rnorm(1000, mean=0, sd=1 # create a set of random normal values

# plot the density of the random numbers you just generated

# let the bandwidth = nrd0 "rule of thub" h= .9*A*n*exp(-1.5), A = min{s,IQR/1.34}

# select the gaussian kernal using 1000 points (which you just generated)

plot(density(x=rn, bw="nrd0", kernel="gaussian", n=1000), col=5) # gaussian is default

lines(x, y) # plot the exactly normal data set you created earlier to compare

# plot a series of KDE plots with varying bandwitdth using 'width='option

# When this option is specified instead of 'bw',and is a number, the density

# function bandwidth is set to a kernel dependent multiple of 'width'

for(w in 1:3)

lines(density(x=rn, width=w, kernel="gaussian", n=1000), col=w+1)

# what if we change the kernal

# first close the graphics window to start over

# look at varius bandwidths using a kernal other than gaussian

# KDE otpions:

# gaussian rectangular triangular

# biweight cosine optcosine"

# epanechnikov

plot(density(x=rn, bw="nrd0", kernel="rectangular", n=1000), col=1)

for(w in 1:3)

lines(density(x=rn, width=w, kernel="rectangular", n=1000), col=w+1)

# or to just look at various densities in one plot

# using a lines statment specifying the specific kernal

plot(density(x=rn, bw="nrd0", kernel="gaussian", n=1000), col=1)

lines(density(x=rn, bw="nrd0", kernel="rectangular", n=1000), col=2)

lines(density(x=rn, bw="nrd0", kernel="triangular", n=1000), col=3)

lines(density(x=rn, bw="nrd0", kernel="epanechnikov", n=1000), col=4)

lines(density(x=rn, bw="nrd0", kernel="biweight", n=1000), col=5)

lines(density(x=rn, bw="nrd0", kernel="cosine", n=1000), col=6)

lines(density(x=rn, bw="nrd0", kernel="optcosine", n=1000), col=7)

# or to just look at various densities in one plot

# using a lines statment specifying the specific kernal

plot(density(x=rn, bw="nrd0", kernel="gaussian", n=1000), col=1)

# note if you want the default gaussian, it does not have to be specified

plot(density(x=rn, bw="nrd0", n=1000), col=1)

#-----------------------------------#

# PART2 EXAMPLES USING SAMPLE DATA

#-----------------------------------#

# load salary data

salary<-c(240,240,240,240,240,240,240,240,255,255,265,265,280,280,290,300,305,325,330,340)

print(salary)

library(Hmisc) # load for describe function

describe(salary) # general stats

library(lattice) # load lattice package for histogram graphics

hs <- hist(salary) # plot histogram and store data

print(hs)

# create a kde of salary data (gaussian)and store it as dens

dens <- density(x=salary, bw="nrd0", n=20)

print(dens)

rs <- max(hs$counts/max(dens$y)) # create scaling factor for plotting the density

lines(dens$x, dens$y*rs, col=2) # plot density over histogram using lines statment

# compare this to just plotting the non-rescaled density over the histogram

lines(density(x=salary, bw="nrd0", n=20),col=3) # green line barely shows up

# also, compare to just plotting the estimated density for the sample

# data in a separate plot

plot(density(x=salary, bw="nrd0", n=20)) # or equivalently

plot(dens, col=3)

# now that we understand the various ways we can plot a specified KDE

# we can apply the tools used in part 1 to look at the effects of

# various bandwidths and kernal choices - first close the graphing window

# what if we change the kernal

# first close the graphics window to start over

# look at varius KDEs

# KDE otpions:

# gaussian rectangular triangular

# biweight cosine optcosine"

# epanechnikov

# look at various densities in one plot

# using a lines statment specifying the specific kernal

plot(density(x=salary, bw="nrd0", kernel="gaussian", n=20), col=1)

lines(density(x=salary, bw="nrd0", kernel="rectangular", n=20), col=2)

lines(density(x=salary, bw="nrd0", kernel="triangular", n=20), col=3)

lines(density(x=salary, bw="nrd0", kernel="epanechnikov", n=20), col=4)

lines(density(x=salary, bw="nrd0", kernel="biweight", n=20), col=5)

lines(density(x=salary, bw="nrd0", kernel="cosine", n=20), col=6)

lines(density(x=salary, bw="nrd0", kernel="optcosine", n=20), col=7)

# what if we don't specify a bandwidth (clear graphics window)

# what is the default density

plot(density(x=salary, bw="nrd0", kernel="gaussian", n=20), col=1)

lines(density(x=salary, kernel="gaussian", n=20), col=2)# nrd0 appears to be default

# what if we change the default

# nrd (a variation of nrd0)

# ucv (unbiased cross validation)

# bcv (cross validation)

lines(density(x=salary, bw="ucv", kernel="gaussian", n=20), col=2)

lines(density(x=salary, bw="bcv", kernel="gaussian", n=20), col=3)

lines(density(x=salary, bw="nrd", kernel="gaussian", n=20), col=4)

# use print option to see how the bw value changes with each method

print(density(x=salary, bw="nrd0", kernel="gaussian", n=20))

# look at the differences in calculated

# bandwidth and plots with the adjust option

for(w in 1:3)

print(density(x=salary, bw="nrd0", adjust=w, kernel="gaussian", n=20))

plot(density(x=salary, bw="nrd0",adjust=1, kernel="gaussian", n=20), col=1)

lines(density(x=salary, bw="nrd0",adjust=2,kernel="gaussian", n=20), col=2)

lines(density(x=salary, bw="nrd0",adjust=3,kernel="gaussian", n=20), col=3)