setwd("D:\R Language Learning")#Change working path
write.table(y, "sample.csv", sep=",")#Save the file
> x=scan()#Manual Input 1: 1 2: 2 3: 3 4: 5 5: Read 4 items > x [1] 1 2 3 5 x=scan("a.txt")
as.array() as.character() as.data.frame() as.matrix()
is.array() is.character() is.data.frame()
d = density(), dnorm() p = distribution function pnorm() q =Quantile functions quantile(), qnorm() r = generate random number rnorm()
str Function#Show Object Internal Structure
^ or ** power
A/B General Division
A%%B Remainder, Modular Operation
A%/%B Integer Division
&and
| or
!wrong
ceiling(X) #ceil floor(x)#Rounding Down trunc(5.54)#Take the integer part 5 of x in the direction of 0 round(x,digits = 2) Keep two decimal places 3.45184 =>3.45 signif(x,digits = n) Keep as n Bit Significant Number 3.45184 =>3.5 log(x,base = n) with n Bottom Logarithm > log(2,base =exp(1) ) [1] 0.6931472
mean() weight.mean(d,c(1,2,4,5,6)) #Weighted Average median() sd() var() mad()Absolute Median Difference # For example, if you have a data sample set X={2 3 8 7 9 6 4}, # At this point the median of the data is 6, # The original data minus the median to find the absolute value to form a new data sample of {4 3 2 1 3 0 2}. # The median of the new data sample is 2, so the absolute median difference of the original data sample set is 2.
quantile(x,probs = c(0.25,0.75),names = T,type = )Quantile names Is to show 50% 50,Or 50, type There are nine choices fivenum(x) #Find five quantile values, equivalent to quantile(x,names=F,type=1) IQR(Quartile spacing) prod(1:5) #multiply continuously factorial(5) #Factorial function 5! range()range diff(x,lag = n)Lag Score min() max() any(x>5) #any condition all(x>5) #all Conditions any()The function determines if at least one of these values is TURE. all()The function is similar in that it determines if all of these values are TRUE. choose(5,2) #Two out of five, with several choices C(25) rank()
combn() #Used to produce a combination of set elements, such as finding a subset of sets 1,2,3 that contain two elements, the output being a matrix arranged in columns combn(1:3,2) #Output results are sorted by column and are of type matrix [,1] [,2] [,3] #All Possibilities of 1,2,3 [1,] 1 1 2 [2,] 2 3 3
cumsum(1:5) #accumulation cumprod(1:5) #Multiplication cummin(x) #Minimum cumulative, equivalent to taking the minimum value of a variable from left to right cummax(x) #Maximum cumulative, equivalent to taking the maximum value of a variable from left to right
intersect(x,y) #intersect union(x,y) #Union setdiff(x,y) #Take the difference set and drain y from x setequal(x,y) #Determine whether two vectors are equal unique(x) #Take Unique Value which(duplicated(x)) #Index to find duplicate elements duplicated()#The function is to remove duplicate data from the data frame, if deleting duplicate data Kpil<-exceldata[!duplicated(exceldata[,c("tian","yin")])]; pmin()\pmax() #Compares vectors of equal length one by one and returns the minimum or maximum value of the k-th element of all vectors sort(x,decreasing = F)Sort Ascending order(x,decreasing = T) Descending order F Is ascending pi Circumference
rug()Add the location of the data point below the graph ftable() xtabs() prod=T Vertical coordinate units in probability
mode mode Lieutenant General integer and double Show as numeric class mode Refers to the types of variables such as numeric, character, logical, etc. class Refers to categories of variables such as matrices, lists, data frames typeof typeof Is a subdivision of variable types > 2^(1:5) [1] 2 4 8 16 32 > class(x) [1] "integer" > class(v) [1] "data.frame" > typeof(x) [1] "integer" > typeof(v) [1] "list"
names(cars)View variable names tail()View the last row.names() > attributes(cars) #View Properties $names [1] "speed" "dist" $class [1] "data.frame" $row.names [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 [19] 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 [37] 37 38 39 40 41 42 43 44 45 46 47 48 49 50 ncol(cars) Number of columns nrow(cars) Number of rows dim(cars)
scale standardization
x<-c(1,2,3,4) xx<-scale(x) scale(x,scale = F) #Refers to the subtraction of the mean for each element of x without dividing by the variance scale(x,center=TRUE,scale=F) #Centralize data object x by column (that is, minus the average) scale(x,center = F) #Divide each element directly by its variance without subtracting the mean scale(x,center=T,scale=T) #Meanwhile (i.e. standardization) (mean 0, standard deviation 1) #If you want to standardize any mean and standard deviation for each column scale(x,center=TRUE,scale=TRUE)*sd+M newdata<-scale(marks$math)*10+10 #The scale function is used to standardize the column with a standard deviation of 10 and a mean of 10. //Use scale(x)*sd+mean
Complex Mathematical Functions
- trigonometric function
sec() #Secant, slope to adjacent ratio csc() #Cosecant, slope versus edge tan() #Tangent, ratio of opposite edge to adjacent edge
- Inner and Outer Product
#x,y are vectors, calculating the internal product (quantity product, dot product, scalar product) of two vectors: x.y=|x|*|y|*cos(x,y), where |x| denotes the vector size sqrt(sum(x**2)) crossprod(x,y) or X%*%y #Vector Outer Product Definition: Vector Product, Vector Product, Cross Product, Note a *b. If A and B are not collinear, the modulus of a *b is: |a*b=|a|b|sin(a,b); the direction of a*b follows the right-hand rule: perpendicular to a and b, and a, B and a*b form a right-handed system in this order. If a, B are collinear, a*b=0 X%o% y or tcrossprod(x,y)
If you need to find the maximum and minimum of a function, you need the nlm() and optim() functions. For example, if you want to find the minimum of f(x)=x^2+sin(x) nlm(function(x) return(x^2+sin(x)),0) Where the output is the minimum of $minimum, $estimate value of the transverse x, $iterations iteration number, 0 set initial value
Random Numbers and Sampling Simulation
- set.seed
set.seed(i) in the r language can be any number, and it serves as a number The set.seed() function is designed to ensure that the random numbers you randomly generate are consistent. x=rnorm(5) [1] -0.4657179 -0.8252361 -0.1893222 -0.8956596 1.5736333 Once more x=rnorm(5) [1] -0.5802258 -0.2242277 -0.5897595 0.6455071 0.5062776 Each rerun results are different When seeds are planted, the results are the same for each repetition, with a number i representing a result set.seed(5) x=rnorm(5) [1] -0.84085548 1.38435934 -1.25549186 0.07014277 1.71144087 Rerun set.seed(5) x=rnorm(5) [1] -0.84085548 1.38435934 -1.25549186 0.07014277 1.71144087 The result is the same, is it the same when set.seed(5) is replaced by set.seed(2)?It's obviously different, so the parameters in set.seed() brackets can be any number. It's the seed number you set up on behalf of you. It doesn't participate in the operation. It's just a marker.
-
runif(n,min=,max=) uniformly distributed random numbers; omitting min and Max means uniformly distributed random numbers on [0,1], resulting in N random numbers
-
Random sampling
sample(x,n,replace=T,prob=null) playback versus no playback sampling
Where x represents the population of the sample (population vector, which can be character, numeric, logical, etc.);
n denotes the sample size, i.e., how many times a sample has been taken;
replace=F, default to no playback sampling;
prob can set the sampling probability for each different sampling unit to sample with unequal probability
x=1:100
Sample (x, 20, replace =, size =, prob =)#sampling -
Majority calculation: names (which.max (table(x))#x is a numeric vector
-
#Functions for calculating kurtosis and skewness:
Mtcarss<-function (x, na.omit=FALSE){#na.omit Removes all rows with missing values
if(na.omit)
X<-x[!Is.na(x)]; #Returns non-empty x
m<-mean(x);
n<-length(x);
s<-sd(x);
skew<-sum((x-m)3/s3)/n;
kurt<-sum((x-m)4/s4)/n-3;
return(c(n,m,s,skew,kurt));
} -
mean(salary,trim=0.2) truncated mean, which means 20% of the mean is truncated at both ends of the salary field. Trim ranges from 0 to 0.5
[6] Covariance
cov(x,y) #covariance, the vector dimensions of X and y must be equal
#Calculate variance or covariance matrix (each element of which is the covariance between vectors)
cov(x) #covariance matrix, x is a matrix or data frame
x=matrix(21:40,nrow = 4,ncol = 5,byrow = FALSE) #matrix f=c(1,2,3,10,20,30) rnames=c("R1","R2") cnames=c("C1","C2","C3") my=matrix(f,nrow = 2,ncol = 3,byrow = TRUE,dimnames = list(rnames,cnames)) # C1 C2 C3 R1 1 2 3 R2 10 20 30
array
x=array(data = ,dim = ,dimnames = ) arr1 <- array(1:10) #Vector equivalent to 1 dimension: 1 2 3 4 5 7 8 9 10 arr2 <- array(1:10, dim=c(2,5)) #Matrix equivalent to 2 rows and 5 columns # [,1] [,2] [,3] [,4] [,5] # [1,] 1 3 5 7 9 # [2,] 2 4 6 8 10 arr3 <- array(1:24, dim=c(3,4,2)) #An array of 3 x 4 x 2 dimensions was created # , , 1 # # [,1] [,2] [,3] [,4] # [1,] 1 4 7 10 # [2,] 2 5 8 11 # [3,] 3 6 9 12 # # , , 2 # # [,1] [,2] [,3] [,4] # [1,] 13 16 19 22 # [2,] 14 17 20 23 # [3,] 15 18 21 24 //Equivalent to a "cube", 3 is high or_, 4 is long or_, 2 is wide or "inside" dim1 <- c("A1","A2","A3") dim2 <- c("B1","B2","B3") dim3 <- c("C1","C2") arr4<-array(1:18, dim=c(3,3,2), dimnames = list(dim1, dim2, dim3)) arr4[2,3,1] # Get the value of a single element: 8 arr4[2,1,] #Gets all element values for the second level of the first dimension and the first level of the second dimension arr4[2,,] #Gets all the combined element values at the second level of the first dimension arr4[,2,] #Gets all the combined element values for the second level of the second dimension arr4["A2","B3","C2"] #Get element values by combining horizontal names
-
When a data frame
The attach() function opens to add a data frame to R's search path, and when R encounters a variable,
The search path will look for this variable
detach() off
with function
with(data.frame,{summary(age),mean(weight),plot(age,sys)} Similar to attach(), but only executed at {}, within {} the system automatically finds the variable for data.frame However, the build in {} cannot be saved outside of with (a local variable). To save as a global variable, use <<- with(data.frame, {a < - summary(age), #a, B are global variables b <<- mean(weight), C <- plot(age, sys)} #c is a local variable
factor
Types used to store categories factor(x = character(), levels, labels = levels,exclude = NA, ordered = is.ordered(x), nmax = NA) x is a character type and a numeric vector; levels is the specified factor level; the character type is used to set the unique values that x may contain, and the default value is all the unique values of X. If x is not a character vector, then use as.character(x) to convert x to a character vector and get the level of the X vector.The value of the X vector depends on levels. exclude indicates the level to be excluded; excluded characters nmax: The number of upper bounds of the level, representing the upper bound of the number of factors. For example, the value of factor sex is a vector c('f','m','f','f','m'), and the factor level is c('f','m'): sex <- factor(c('f','m','f','f','m'),levels=c('f','m')) sex [1] f m f f m Levels: f m
- The name of the specified factor level#labels is the name of the specified factor level; it is a horizontal label, a type of character used to label the level,
> x<-c(4, 6 ,4 ,6 ,6 ,6 ,6 ,3 ,1 ,4 ,5, 3 ,1 ,2 ,6, 4 ,5 ,3 ,6, 2) > fact<-factor(x,label=c("A","B","C","D","E","F")) > fact [1] D F D F F F F C A D E C A B F D E C F B Levels: A B C D E F #It is equivalent to renaming the factor horizontally; the character order of the labels parameter should be the same as that of the levels parameter
- Define the factor at the ordered level
> x<-c(4, 6 ,4 ,6 ,6 ,6 ,6 ,3 ,1 ,4 ,5, 3 ,1 ,2 ,6, 4 ,5 ,3 ,6, 2) > fact<-factor(x,label=c("A","B","C","D","E","F"),order=T) #Ordered indicates whether the levels of the factors are ordered, and a logical value is used to specify whether the levels are ordered. > fact [1] D F D F F F F C A D E C A B F D E C F B Levels: A < B < C < D < E < F
sex=factor(c('f','m','f','f','m'),levels=c('f','m'),labels=c('female','male'),ordered=TRUE) > sex [1] female male female female male Levels: female < male
- Looking at the factor level, you can see through the function levels (factors):
levels(heights$gender) #[1] "f" "m" The horizontal series, which corresponds to the length of the level, can be queried by the nlevels function: > nlevels(heights$gender) [1] 2
Normally, the factors are generally out of order, which can be verified by the is.ordered() function:
> is.ordered(sex) [1] FALSE
Converts the factor horizontally to the string as.character(heights$gender)
Converting a factor to a corresponding integer Use the as.numeric() or as.integer() functions to convert a factor to a corresponding integer as.factor(mydata$Category)
Split Continuous Variables into Categories
The cut() function can cut numeric variables into different blocks and then return a factor to group the numeric data: use the cut function to group the numeric data
cut(x, breaks, labels = NULL)
Parameter Notes:
x: Numeric variable
breaks: cutting point vector
Labels: labels for each group
For example, divide the height data into specified cutting point vectors:
cut(heights$height_cm,c(150,170,190)) [1] (150,170] (170,190] (150,170] Levels: (150,170] (170,190]
salary.cut<-cut(salary,breaks=c(1000,2000,3000,max(salary)))
Practice
a <- read_excel("C:/Users/Administrator/Desktop/a.xlsx") # A tibble: 19 x 2 mus_n name <chr> <chr> 1 Fear Xue Zhiqian 2 performer Xue Zhiqian 3 Serious Xue Xue Zhiqian 4 half Xue Zhiqian 5 Accident Xue Zhiqian 6 gentleman Xue Zhiqian 7 regret Xue Zhiqian 8 Xue Zhiqian in Animal World 9 Good night Xue Zhiqian 10 Fear Zhang Yixing 11 Accident Zhang Yixing 12 sheep Zhang Yixing 13 I am not good at Zhang Yixing 14 half Zhang Yixing 15 performer Zhang Yixing 16 Midsummer Hair is not easy 17 I don't have good hair easily 18 get rid of-blues Hair is not easy 19 performer Hair is not easy attach(a) v<-as.data.frame(table(a$mus_n)) p<-list() for (i in v$Var1) {p=append(p,list(name[which(mus_n==i)]))} v$num<-p v detach(a) #Result Var1 Freq num 1 Animal World 1 Xue Zhiqian 2 Fear 2 Xue Zhiqian, Zhang Yixing 3 Serious snow 1 Xue Zhiqian 4 gentleman 1 Xue Zhiqian 5 Midsummer 1 Hair is not easy 6 Good night 1 Xue Zhiqian 7 My bad 2 Zhang Yixing, Hair is not easy 8 get rid of-blues 1 Hair is not easy 9 performer 3 Xue Zhiqian, Zhang Yixing, Hair is not easy 10 sheep 1 Zhang Yixing 11 half 2 Xue Zhiqian, Zhang Yixing 12 regret 1 Xue Zhiqian 13 Accident 2 Xue Zhiqian, Zhang Yixing
pretty(x,n) Create Split Points,take x partition off n Intervals rnorm(50,0,1)
Number of characters in return vector
> x<-c("a","aa","aaaaaddddd") > a<-nchar(x) > a [1] 1 2 10 > a<-nchar(x[c(1,3)]) > a [1] 1 10
> xx<-"aaaaadddddd" > substr(xx,2,5) [1] "aaaa" > substr(xx,2,7) [1] "aaaadd" > substr(xx,2,4)="41" > xx [1] "a41aadddddd"
Separate String Function
> x=strsplit("avg","") > x [[1]] [1] "a" "v" "g" > strsplit("2012-155-544",split = "-") [[1]] [1] "2012" "155" "544"
Link function
> paste("x",1:5,sep = "") [1] "x1" "x2" "x3" "x4" "x5" > paste("x",1:5,sep = "To be frank") [1] "x Frank 1" "x Frank 2" "x Frank 3" "x Frank 4" "x Frank 5" > paste("x","ddd",sep = "To be frank") [1] "x To be frank ddd" year<-c(1959,1959,1959,1959,1960) month<-c(6,7,8,9,10) paste(year,month,sep = "-")
Convert case
> toupper("aaaddf") [1] "AAADDF" > tolower("KDSDDD") [1] "kdsddd"
Enter data manually
> number <- data.frame(age=numeric(0),sxe=character(0),weight=numeric(0)) > number=edit(number) > number age sxe weight 1 12 Macao 11 2 25 Proud 11 3 55 large 5 4 44 large 5
-
Export excel file to a comma-separated csv file.Then the best way to read excel is to use read.table, which is the reading of Excel data
-
There are two simple ways to get data from an Excel spreadsheet.
- Using Clipboard
An easy way to do this is to open a spreadsheet in Excel, select the data area you want, and repeat
Copy to clipboard (using CTRL+C). Then type a command in R
> mydata <- read.delim("clipboard")
- Use package RODBC.
To get the data from Sheet1 in the file "c:\data\body.xls", set it to
Sex Weight Height M 65 168 M 70 172 F 54 156 F 58 163 //You can use commands > library(RODBC) > z <- odbcConnectExcel("c:/data/body.xls") > foo <- sqlFetch(z, "Sheet1") > close(z)
Function transform
Add a new column to the original data frame, change the value of the original variable column, and delete the column variable by assigning NULL
#To add a new column to the data frame
x1=c(1,2,35,4,4) x2=c(7,8,4,5,6) newdata<-transform(newdata,sumy=x1+x2,meany=(x1+x2)/2) > newdata x1 x2 sumy meany 1 1 7 8 4.0 2 2 8 10 5.0 3 35 4 39 19.5 4 4 5 9 4.5 5 4 6 10 5.0 //Or use this method newdata<-data.frame(x1=c(1,2,35,4,4),x2=c(7,8,4,5,6)) names(newdata)[1:2]=c("tan","dd")#Modify Name attach(newdata) newdata$sum<-x1+x2 newdata$mean<-(x1+x2)/2 detach(newdata)
append(1:5, 0:1, after = 3) [1] 1 2 3 0 1 4 5
airquality Dataset with missing values complete.cases(airquality)#Determine if each row has missing values [1] TRUE TRUE TRUE TRUE FALSE FALSE TRUE TRUE [9] TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE [17] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE ............ which(complete.cases(airquality)) > sum(complete.cases(airquality)) [1] 111 na.omit()Delete missing value rows > a=airquality[1:10,] > names(a)=c("x1","x2","x3","x4","x5","x6") > a x1 x2 x3 x4 x5 x6 1 41 190 7.4 67 5 1 2 36 118 8.0 72 5 2 3 12 149 12.6 74 5 3 4 18 313 11.5 62 5 4 5 NA NA 14.3 56 5 5 6 28 NA 14.9 66 5 6 7 23 299 8.6 65 5 7 8 19 99 13.8 59 5 8 9 8 19 20.1 61 5 9 10 NA 194 8.6 69 5 10 > a=na.omit(a)#Remove all rows with missing values > a x1 x2 x3 x4 x5 x6 1 41 190 7.4 67 5 1 2 36 118 8.0 72 5 2 3 12 149 12.6 74 5 3 4 18 313 11.5 62 5 4 7 23 299 8.6 65 5 7 8 19 99 13.8 59 5 8 9 8 19 20.1 61 5 9 a[order(a$x4),]#sort #Connect two data frames merge(q,newdata)#Connections like databases xx=rbind(a[1:5,],newdata)Line Link, Horizontal Link cbind(q,newdata)Column Link, Vertical Link
subset() extracts a subset that meets the criteria
> subset(Puromycin, state == "treated" & rate > 160) subset(Puromycin, conc > mean(conc))
A[c(-1,-3)] Exclude one or more columns you don't want
barplot(table(x))
fix() not only modifies tables, but also functions
Chapter IV
t-test
The test efficiency is 1-beta, which is the power
d is
> pwr.t.test(d=0.2678571428571429,sig.level = 0.05,power = 0.9,type = "one.sample",alternative = "two.sided") One-sample t test power calculation #Result n = 148.3841#To check efficiency is 90%.A sample size of 148.3841 is required d = 0.2678571 sig.level = 0.05 power = 0.9 alternative = two.sided
> pwr.t.test(n=100,d=0.2678571428571429,sig.level = 0.05,type = "one.sample",alternative = "two.sided") One-sample t test power calculation #Result n = 100 d = 0.2678571 sig.level = 0.05 power = 0.7557033#Fixed sample size of 100, only 75.57% of the grasp alternative = two.sided
pwr.2p.test() ratio and overall pairing design ratio
pwr.2p2n.test()
pwr.anova.test()
pwr.chisq.test() chi square
pwr.f2.test() linear model
pwr.t2n.test()
#These are test performance analyses that help determine the sample size needed given confidence and effect values.
#is the estimate of the sample size, or the probability of detecting the required effect value for a given sample size, an estimate of the degree of certainty
Chapter V
i=1 while (i<11) { print("d") i=i+1 if (i>10) { break } }
break() can also be used in for statements
sweep(x, MARGIN, STATS, FUN="-", ...) Operate on the matrix.The MARGIN is 1, which represents the direction of the row. 2 represents the direction of the column.STATS is a parameter of the operation.FUN is an operation function and subtraction is the default. sweep(x,1,1:5, "*") multiplies each element of vector 1:5 by each line
apply
apply(X, MARGIN, FUN) Here: -x: an array or matrix -MARGIN: Two values 1 or 2 determine which dimension to function on -MARGIN=1`: Operation based on rows -MARGIN=2`: Operation is column based -MARGIN=c(1,2)`: Operate on both rows and columns -FUN: Which operation to use, The built-in functions are mean, medium, sum, min, max And, of course, a wide range of user-defined functions
Ifelse (condition if Execution 1, otherwise Execution 2)
q=c("ff","aa") for (i in q) { print(switch (i,aa = "1",ss="2",ff="3")) }
Percentile ratio using quantile() function
> data <- c(1,2,3,4,5,6,7,8,9,10) > quantile(data,0.5) 50% 5.5 > quantile(data,c(0.25,0.75)) 25% 75% 3.25 7.75
Character separation strsplit(aa$name, "")
Transpose t()
The aggregate function is a commonly used function in data processing.
Simply put, [group by] in sql language can group and aggregate data as required, then add up and average the aggregated data.
> x=data.frame(name=c("Zhang San","Li Si","King Five","Zhao Six"),sex=c("M","M","F","F"),age=c(20,40,22,30),height=c(166,170,150,155)) > x name sex age height 1 Zhang San M 20 166 2 Li Si M 40 170 3 King Five F 22 150 4 Zhao Six F 30 155
aggregate(x[,3:4],by=list(sex=x$sex),FUN=mean) #sex age height 1 F 26 152.5 mean value 2 M 30 168.0 > > list(sex=x$sex) $sex [1] M M F F Levels: F M
summary() descriptive statistics sd(1,4) apply : Used to traverse rows or columns in an array and process their elements using a specified function. lapply : Traverses through each element in the list vector and uses the specified function to process its elements.Returns the list vector. sapply : and lapply It is basically the same, except that the returned result is simplified and normal vectors are returned. mapply: Supports passing in more than two lists. tapply: Access parameters INDEX,To group data, sum SQL In by group Same.
#Define Functions fun1<-function(x){ sum1<-sum(x) aa=sum1/length(x) return(aa) }
Chapter VI
(1)
1. Descriptive statistical summary
summary(iris$Sepal.Length) # Min. 1st Qu. Median Mean 3rd Qu. Max. 4.300 5.100 5.800 5.843 6.400 7.900
2.Hmisc package describe()
describe(iris$Sepal.Length) #iris$Sepal.Length n missing distinct Info Mean Gmd #Basic Descriptive Statistics 150 0 35 0.998 5.843 0.9462 .05 .10 .25 .50 .75 .90 .95 #Quantile 4.600 4.800 5.100 5.800 6.400 6.900 7.255 lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
3. describe() in psych package
describe(iris$Sepal.Length) # vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 150 5.84 0.83 5.8 5.81 1.04 4.3 7.9 3.6 0.31 -0.61 0.07
4. stat.desc() in the pastecs package
stat. desc (x, basic=TRUE, desc=TRUE, norm-FALSE,p=0.95) Where x is the data frame.basic=TRUE (default) Calculate the number of all values, null values, missing values, and the minimum, maximum, range, and sum.desc=TRUE (default), calculated median, mean, mean standard error, mean 95% Confidence interval, variance, standard deviation, and coefficient of variation.nom-TRUE (non-default), calculates the normal distribution statistics, including skewness and kurtosis (and their statistical significance) and the results of the Shapiro-Wilk normal test. stat.desc(iris$Sepal.Length, basic=TRUE, desc=TRUE, norm=FALSE,p=0.95) # nbr.val nbr.null nbr.na min max range sum median mean 150.00000000 0.00000000 0.00000000 4.30000000 7.90000000 3.60000000 876.50000000 5.80000000 5.84333333 SE.mean CI.mean.0.95 var 0.06761132 0.13360085 0.68569351 std.dev coef.var 0.82806613 0.14171126
(2) Grouped calculation of descriptive statistics
1.aggregate() attach(iris) aggregate(Sepal.Length,by=list(Species),FUN=mean) # Group.1 x 1 setosa 5.006 2 versicolor 5.936 3 virginica 6.588
2.by(),This function is similar to the one written above. //It's just that the by function can return multiple statistics, while aggregae can only return one attach(iris) ss=function(x)(c(n=length(x),m=mean(x),s=sd(x))) by(Sepal.Length,Species,ss) # Species: setosa n m s 50.0000000 5.0060000 0.3524897 -------------------------------------- Species: versicolor n m s 50.0000000 5.9360000 0.5161711 -------------------------------------- Species: virginica n m s 50.0000000 6.5880000 0.6358796
3.psych In Package describeBy() describeBy(Sepal.Length,Species) # Descriptive statistics by group group: setosa vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 50 5.01 0.35 5 5 0.3 4.3 5.8 1.5 0.11 -0.45 0.05 -------------------------------------- group: versicolor vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 50 5.94 0.52 5.9 5.94 0.52 4.9 7 2.1 0.1 -0.69 0.07 -------------------------------------- group: virginica vars n mean sd median trimmed mad min max range skew kurtosis se X1 1 50 6.59 0.64 6.5 6.57 0.59 4.9 7.9 3 0.11 -0.2 0.09
4.doBy Grouping Descriptive Statistics in Packages summaryBy(Sepal.Length~Species,data = iris,FUN = mean) Species Sepal.Length.mean 1 setosa 5.006 2 versicolor 5.936 3 virginica 6.588
Chapter VII Hypothesis Test (t Test)
1. Single-sample t-test
t.test(iris$Sepal.Length,mu = 4.5,alternative = "two.sided",conf.level = 0.95) Where mu is the population mean This is for the original data, if there is no original data, then use the formula to hard calculate Use if double-sided test t=(x-u)/(s/sqrt(n)) v=n-1 2*pt(-abs(t),v)
Single test does not multiply 2
2. Two-sample t-test
t.test(x~group,var.equal=TRUE,alternative="two.sided",conf.level=0.95) Homogeneity test of variance (F test) var.test(x,y,ratio = 1,alternative = "two.sided",conf.level = 0.95) #ratio = 1 is the ratio of variance between two That is, the original assumption that two variances are equal
Chapter VIII
Variance analysis can be done with both aov() and lm functions
Interaction means that A and B interact together to influence the dependent variable.
y~A+B+A:B represents the interaction of A, B, and A with B to predict y y~A*B*C represents all possible interactions and is equivalent to y~A+B+A:B+A:C+B:C+A:B:C Represents all variables except the dependent variable, such as y~. Equivalent to y~A+B+C y~(A+B+C)^2 is equivalent to y~A+B+A:B+A:C+B:C
Following are some commonly used research design expressions, with lowercase letters as quantitative variables, uppercase letters as group factors, and Subject as loose variables:
Univariate ANOVA, y-A.
_Univariate ANCOVA, y-x+A with a single covariate.
.Two-factor ANOVA.yAB.
_Bivariate ANOVA, y-x1+X2+A+B, with two covariates.
Randomized block, y-B+A. of which 8 were block factors.
1. ANOVA.y-A+Ermor (Subjec/A) in Sister One Factor.
Repeated measurements of ANOVA, y-B w+Eror (Subjectwl) with a single intra-group factor (w) and a single inter-group factor (B) will have an impact on the simulation results in the order of the effects in the expression in the event of a non-contest design or in the presence of covariates.Love is far away, in the analysis of not a thousand ideas, expectation, "Japanese rules - the letter of the wind is endless.R I know Class 1 (Ordinary expensive) side!
To calculate the ANOVA effect, the results of the ANOVA table in R for model-A+B+A:B will be evaluated.
The effect of A on y;
/ The effect of B on y when finding A.
The interaction between A and B when controlling the main effects of A and B.
If the data is modeled using the following expression; y~A+B+A:B has three types of methods to decompose the variance explained by the effects on the right for y.
aa=read.table("C/a.txt",header = TRUE,sep = "|") library("multcomp") attach(aa) table(gruop) # gruop # A B C # 10 10 10 aggregate(apt,by=list(gruop),FUN=mean) aggregate(apt,by=list(gruop),FUN=sd) fit=aov(apt~gruop) Use here lm()Is the same # # Call: # aov(formula = apt ~ gruop) # # Terms: # gruop Residuals # Sum of Squares 240.20499 71.77491 # Deg. of Freedom 2 27 # # Residual standard error: 1.630439 # Estimated effects may be unbalanced summary(fit) # # Df Sum Sq Mean Sq F value Pr(>F) gruop 2 240.20 120.10 45.18 2.43e-09 *** Here P Value is much less than 0.05,Reject the original hypothesis, statistically significant, three groups of mean are not equal # Residuals 27 71.77 2.66 # --- # Signif. codes: # 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 library("gplots") plotmeans(apt~gruop,xlab = "gruop",ylab = "apt",main="mean plot \nwith 95% CI") TukeyHSD(fit) # # Tukey multiple comparisons of means # 95% family-wise confidence level # # Fit: aov(formula = apt ~ gruop) # # $gruop # diff lwr upr p adj # B-A 6.514 4.7061219 8.321878 0.0000000 # C-A 1.206 -0.6018781 3.013878 0.2411216 # C-B -5.308 -7.1158781 -3.500122 0.0000002
Later variance analysis is more difficult, such as variance analysis of Latin square design data and variance analysis of factorial design data and orthogonal design data.
Analysis of variance for repeated measurements
But all are the same routine
Examp1el_6<-read,table("example8-8,csv",header=TRUE,sep-",") attach(Example8_6) block <-factor(block,order=FALSE) dose <-factor(dose,ordex=FALSE) table(block,dose) aggregate(weight,by-list(block),FUN=mean) aggregate(weight,by-list(bloek),FUN-sd) aqqregate(welight,by-1ist(dose),FUN-mean) aggregate(uoight,by=list(dosel,FUN=sd) fit <-aov (weight-block+dose) summary(fit) TakeyHsD(f1t,"dose") TakeyHsD(fit,"block") detach(Example8_6)
Covariance analysis
Eliminate the interference and influence of non-processing factors to minimize the estimation of experimental error
Non-processing factors are those that are either uncontrollable or difficult to control
Covariance analysis combines the effects of regression analysis and variance analysis, both for regression analysis and for variance analysis
Call the uncontrollable factor X covariate
Y is the dependent variable
library("multcomp") # > head(litter) dose weight gesttime number Analysis is different here dose Effect on weight, duration of pregnancy as covariate 1 0 28.05 22.5 15 2 0 33.33 22.5 14 3 0 36.37 22.0 14 4 0 35.52 22.0 13 5 0 36.77 21.5 15 6 0 29.60 23.0 5 attach(litter) table(dose) # dose 0 5 50 500 20 19 18 17 aggregate(weight,by=list(dose),FUN=mean) aggregate(weight,by=list(dose),FUN=sd) fit=aov(weight~gesttime+dose) summary(fit) TukeyHSD(fit,"dose") detach(litter) (1) Group.1 x 1 0 32.30850 2 5 29.30842 3 50 29.86611 4 500 29.64647 (2) Group.1 x 1 0 2.695119 2 5 5.092352 3 50 3.762529 4 500 5.404372 (3) Df Sum Sq Mean Sq F value Pr(>F) gesttime 1 134.3 134.30 8.049 0.00597 ** Explains that pregnancy time is related to weight, and corrects for the effect of pregnancy time dose 3 137.1 45.71 2.739 0.04988 * Explain dose Significant effect on weight difference Residuals 69 1151.3 16.69 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 (4) Tukey multiple comparisons of means 95% family-wise confidence level Fit: aov(formula = weight ~ gesttime + dose) $dose diff lwr upr p adj 5-0 -3.4171829 -6.862400 0.0280344 0.0526932 accept 50-0 -1.8696021 -5.363552 1.6243477 0.4982585 accept 500-0 -2.9743214 -6.521945 0.5733020 0.1314349 accept 50-5 1.5475808 -1.989654 5.0848159 0.6589068 accept 500-5 0.4428615 -3.147400 4.0331232 0.9880653 accept 500-50 -1.1047193 -4.741771 2.5323320 0.8543245 accept #
Chapter IX Linear Regression and Correlation
(1) Linear correlation analysis
(1) Coefficient of correlation
cor(x = ,use =,method = ) use Include all.obs Assuming there are no missing values, error if missing values are encountered everything When missing data is encountered, the correlation coefficient will be set to missing (Default) complete.obs()Row Delete pairwise.complete.obs Delete in pairs method Include pearson spearman Rank correlation kendall
(2) Correlation coefficient test
cor.test(x = ,y= ,alternative = ,method = )
(3) Scatter plot
plot(x,y) //Example cor.test(cars$speed,cars$dist,alternative = "two.sided",method = "pearson") plot(cars$speed,cars$dist) //Result # Pearson's product-moment correlation # # data: cars$speed and cars$dist # t = 9.464, df = 48, p-value = 1.49e-12 # alternative hypothesis: true correlation is not equal to 0 # 95 percent confidence interval: # 0.6816422 0.8862036 # sample estimates: # cor # 0.8068949 cor.test(cars$speed,cars$dist,alternative = "two.sided",method = "spearman") # Spearman's rank correlation rho # # data: cars$speed and cars$dist # S = 3532.8, p-value = 8.825e-14 # alternative hypothesis: true rho is not equal to 0 # sample estimates: # rho # 0.8303568
2. Linear Regression Analysis
Attention to outliers
lm()
y~A+B+A:B represents the interaction of A, B, and A with B to predict y y~A*B*c denotes all possible interactions and is equivalent to y~A+B+A:B+A:C+B:C+A:B:C Represents all variables except the dependent variable, such as y~. Equivalent to y~A+B+C y~(A+B+C)^2 is equivalent to y~A+B+c+A:B+A:C+B:C -Indicates the removal of an example y~(A+B+C)^2-A:B is equivalent to y~A+B+C+A:C+B:C -1 Force intercept removal I() Operates within brackets as a new variable, interpreting the elements within brackets from an arithmetic perspective. For example, y~x+(z+w)^2 expands to y~x+z+w+z:w Whereas y~x+I((z+w)^2) is expanded to y~x+h as a new system equivalent variable Function log(Y)~x+z+w
- Function|purpose
summary) | Show detailed results of the fitted model coeficients) | Lists the model parameters (intercept and slope) of the fitted model confint()| Provides the confidence interval for model parameters (default is 95%) fited ()| Lists the predictions of the fitted model residulas ()| Lists the residual values of the fitted model anova ()| Generate an analysis of variance table that fits the model, or compare two or more; vcov)|List covariance matrices for model parameters AIC()|Output Chichi Information Statistics plot() generates a diagnostic diagram for an evaluation fit model predict() Predicts dependent variable values for new datasets using a fitting model
test6.2 <- data.frame( x1=c(0.4,0.4,3.1,0.6,4.7,1.7,9.4,10.1,11.6,12.6,10.9,23.1,23.1,21.6,23.1,1.9,26.8,29.9), x2=c(52,23,19,34,24,65,44,31,29,58,37,46,50,44,56,36,58,51), x3=c(158,163,37,157,59,123,46,117,173,112,111,114,134,73,168,143,202,124), y=c(64,60,71,61,54,77,81,93,93,51,76,96,77,93,95,54,168,99)) lm.sol <- lm(y~1+x1+x2+x3, data=test6.2) summary(lm.sol) lm.new=lm(y~x1+x2+x3+I(x1^2)+I(x1*x2)+I(x1*x3)+I(x2^2)+I(x2*x3)+I(x3^2)+I(x1*x2*x3),data=test6.2) lm.temp=lm(y~x1+x2+I(x1^2)+I(x1*x2)+I(x1*x3)+I(x2*x3)+I(x3^2)+I(x1*x2*x3)+0,data=test6.2)
(3) Multiple linear regression
> s=lm(Fertility~.,data=swiss) //Here the dataset is swiss, Fertility~. for Fertility as the dependent variable, ~for regression, .Represents a dataset other than Fertility All variables except as independent variables > print(s) # # Call: # lm(formula = Fertility ~ ., data = swiss) # # Coefficients: # (Intercept) Agriculture Examination # 66.9152 -0.1721 -0.2580 # Education Catholic Infant.Mortality # -0.8709 0.1041 1.0770 > summary(s)Model Summary Information # # Call: # lm(formula = Fertility ~ ., data = swiss) # # Residuals: # Min 1Q Median 3Q Max # -15.2743 -5.2617 0.5032 4.1198 15.3213 # # Coefficients: # Estimate Std. Error t value Pr(>|t|) # (Intercept) 66.91518 10.70604 6.250 1.91e-07 *** # Agriculture -0.17211 0.07030 -2.448 0.01873 * # Examination -0.25801 0.25388 -1.016 0.31546 # Education -0.87094 0.18303 -4.758 2.43e-05 *** # Catholic 0.10412 0.03526 2.953 0.00519 ** # Infant.Mortality 1.07705 0.38172 2.822 0.00734 ** # --- # Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 # # Residual standard error: 7.165 on 41 degrees of freedom # Multiple R-squared: 0.7067, Adjusted R-squared: 0.671 # F-statistic: 19.76 on 5 and 41 DF, p-value: 5.594e-10 # # # > s1=step (s, direction="forward") stepwise regression, backward culling # Start: AIC=190.69 # Fertility ~ Agriculture + Examination + Education + Catholic + # Infant.Mortality
abline(lm(carsspeed carsspeed~carsspeed carsdist, col="red")#Add a fitting curve to the scatterplot
Multivariate Linear Regression Analysis
β=(X'X)^-1*X'Y
Multicollinearity
(1) An intuitive method of judgment.
1. In the correlation coefficient matrix of independent variables, the correlation coefficient values of some independent variables are relatively large.
2. The sign of regression coefficient is contrary to professional knowledge or general experience.
3. The results of t-test for the regression coefficients of important independent variables are not significant, but F-test has passed significantly.
4. If you add a variable or delete a variable, the estimates of the regression coefficients will change dramatically.
5. The confidence interval of regression coefficients for important variables is significantly too large.
(2) Variance Expansion Factor (VIF).
(3) eigenvalue discriminant method
If the qq plot satisfies the positive-too hypothesis, the point should fall on a 45-degree straight line.
fited ()| Lists the predictions of the fitted model
[Procedure]
library("MASS") attach(iris) fit1=lm(Sepal.Length~Sepal.Width+Petal.Length+Petal.Width) fit2=lm(Sepal.Length~1) stepAIC(fit2,direction = "both",scope = list(upper=fit1,lower=fit2))#Stepwise regression anova(fit1) summary(fit1) Sepal.Length fitted(fit1) residuals(fit1) detach(iris) [Results) Start: AIC=-55.6 Sepal.Length ~ 1 #Start with an intercept Df Sum of Sq RSS AIC + Petal.Length 1 77.643 24.525 -267.641 #-267.641 Minimum + Petal.Width 1 68.353 33.815 -219.460 + Sepal.Width 1 1.412 100.756 -55.690 <none> 102.168 -55.602 #<none>means doing nothing Step: AIC=-267.64 Sepal.Length ~ Petal.Length #So add the smallest AIC Df Sum of Sq RSS AIC + Sepal.Width 1 8.196 16.329 -326.66 #Minimum + Petal.Width 1 0.644 23.881 -269.63 <none> 24.525 -267.64 - Petal.Length 1 77.643 102.168 -55.60 #First add + Sepal.Width on the basis of Epal.Length ~ Petal.Length #Removing Petal.Length Step: AIC=-326.66 Sepal.Length ~ Petal.Length + Sepal.Width #So they both fit Df Sum of Sq RSS AIC + Petal.Width 1 1.883 14.445 -343.04 #Minimum <none> 16.329 -326.66 - Sepal.Width 1 8.196 24.525 -267.64 - Petal.Length 1 84.427 100.756 -55.69 Step: AIC=-343.04 Sepal.Length ~ Petal.Length + Sepal.Width + Petal.Width #Final Df Sum of Sq RSS AIC <none> 14.445 -343.04 - Petal.Width 1 1.8834 16.329 -326.66 - Sepal.Width 1 9.4353 23.881 -269.63 - Petal.Length 1 15.4657 29.911 -235.86 Call: lm(formula = Sepal.Length ~ Petal.Length + Sepal.Width + Petal.Width) #So all three are preserved Coefficients: (Intercept) Petal.Length Sepal.Width Petal.Width 1.8560 0.7091 0.6508 -0.5565 > anova(fit1) Analysis of Variance Table Response: Sepal.Length Df Sum Sq Mean Sq F value Pr(>F) Sepal.Width 1 1.412 1.412 14.274 0.0002296 *** #Significant, good fit Petal.Length 1 84.427 84.427 853.309 < 2.2e-16 *** Petal.Width 1 1.883 1.883 19.035 2.413e-05 *** Residuals 146 14.445 0.099 > summary(fit1) Call: lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length + Petal.Width) Residuals: Min 1Q Median 3Q Max -0.82816 -0.21989 0.01875 0.19709 0.84570 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.85600 0.25078 7.401 9.85e-12 *** Sepal.Width 0.65084 0.06665 9.765 < 2e-16 *** Petal.Length 0.70913 0.05672 12.502 < 2e-16 *** Petal.Width -0.55648 0.12755 -4.363 2.41e-05 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 0.3145 on 146 degrees of freedom Multiple R-squared: 0.8586, Adjusted R-squared: 0.8557 #R Square 0.8586 works well F-statistic: 295.5 on 3 and 146 DF, p-value: < 2.2e-16 //There are three more results, the actual values, the fitted values, and the residuals
Complex and partial correlation coefficients
library("ggm")
pcor(u,s)
u is a numerical vector. The first two values indicate the subscript of the variable for which the correlation coefficient is to be calculated. The remaining values are the subscript of the conditional variable.
Covariance matrix with s as variable
attach(iris) cor(Example) library(psych) corr.test(Example) pcor(c(1,5,2,3,4),cov(Example)) #Partial correlation coefficient pcor.test(pcor(c(1,5,2,3,4),cov(Example)),3,n=13) #Partial correlation coefficient statistical test pcor(c(2,5,1,3,4),cov(Example)) pcor.test(pcor(c(1,5,2,3,4),cov(Example)),3,n=13) pcor(c(3,5,1,2,4),cov(Example)) pcor.test(pcor(c(1,5,2,3,4),cov(Example)),3,n=13) pcor(c(4,5,1,2,3),cov(Example)) pcor.test(pcor(c(1,5,2,3,4),cov(Example)),3,n=13)
Chapter 11 Logical Regression
The dependent variable is not a continuous normal variable, but a binary variable, y=0 or 1
1. Unconditional Logistic Regression (cohort study or unpaired case, control study group data analysis)
The probability that y takes a value P is related to x, obviously if P=a+bx1+cx2...Is not appropriate because P is 0-1
So log transform
log(P)=a+bx1+cx2...... (1)
P(y=1|x)=exp(a+bx)/1+exp(a+bx) (2)
So ln(P/1-P)=a+bx1+cx2...
Chapter 22 Statistical Charts
hist(x) Draw Histogram plot(x1,x2)Scatter plot plot(q$x5,q$x6,main="Scatter plot",xlab="mathematical analysis",ylab="Higher algebra",xlim=c(0,100),ylim=c(0,100),col="red",pch=19) ##Where pch=19 refers to the point in the graph and what shape to use a=c(1:6) b=c(2,4,6,8,10,12) plot(a,b,type="l") #Draw a Connected Diagram table(x1) x1 79 80 82 85 88 90 91 92 93 94 95 96 97 98 99 100 1 1 1 1 4 1 2 1 7 3 4 1 7 4 11 51 barplot(table(x1)) #Draw bars, columns pie(table(x1)) #Pie chart boxplot(x1,x2)#Box Diagram boxplot(x1,x2,horizontal=T) #Horizontal placement faces(x1) #Face graph stem(x1) #Stem-and-leaf display The decimal point is at the | 78 | 0 80 | 0 82 | 0 84 | 0 86 | 000 88 | 0000 90 | 000 92 | 00000000 94 | 0000000 96 | 00000000 98 | 000000000000000 100| 000000000000000000000000000000000000000000000000000 qqnorm(x1) Q-Q chart qqline(x1) plot(density(rnorm(1000)))#Distribution Function Density Map
data set
data() yields many built-in datasets
Common iris
heatmap(as.matrix(mtcars),Rowv=NA,Colv=NA,col=heat.colors(256),scale="column",margins=c(2,8),main="Car haracteristics by Model") //Thermogram sunflowerplot(iris[,3:4],col="gold",seg.col="gold") pairs(iris[,1:4])Scatter Atlas This example produces 16 graphs, 1 and 1,1 And 2,1 And 3...,4 And 1,4 And 2...Is there a connection between variables par(mfrow=c(3,1)) ##par divides the canvas into three rows and one column, and more plot(x1,x2) scatterplot3d(x1,x2)Draw 3-D Scatter Chart. par(mfrow=c(3,1)) par(bg="red")#Color the entire background plot(rnorm(1000),col=5) #Set only the colors in the axes
plot(rnorm(1000),type="n") x<-par("usr")#Get the coordinates of the four corners of the coordinate axis rect(x[1],x[3],x[2],x[4],col="lightgray")#Set to Bright Gray points(rnorm(1000))Drawing
Using the title() function, set the title, or the
title("Sales Figures for 2010",col.main="blue") title(xlab="X axis",col.lab="red") title(ylab="Y axis",col.lab="blue") rain<-read.csv("cityrain.csv") rain <- read.csv("cityrain.csv") par(mfrow=c(4,1),mar=c(5,7,4,2),omi=c(0.2,2,0.2,2)) for(i in 2:5) { plot(rain[,i],ann=FALSE,axes=FALSE,type="l",col="gray",lwd=2) mtext(side=2,at=mean(rain[,i]),names(rain[i]),las=2,col="black") mtext(side=4,at=mean(rain[,i]),mean(rain[i]),las=2,col="black") points(which.min(rain[,i]),min(rain[,i]),pch=19,col="blue") points(which.max(rain[,i]),max(rain[,i]),pch=19,col="red") }
Set Times New Roman font with par(family="serif", font=2)
0 Default, 1 Bold, 2 Italic, 3 Bold Italic
> plot(rain$Tokyo,type="l",col="red", + ylim=c(0,300), + main="Monthly Rainfall in major cities", + xlab="Month of Year", + ylab="Rainfall (mm)", + lwd=2) > lines(rain$NewYork,type="l",col="blue",lwd=2) > lines(rain$London,type="l",col="green",lwd=2) > lines(rain$Berlin,type="l",col="orange",lwd=2)
par(bty="u") Here BTY changes the style of the coordinate axis
plot(x,col='RED',type="l",lty=6,lwd=3)
Add a black box to the edge of the graph
par(oma=c(1,1,1,1))
plot(x,col='RED',type="l",lty=6,lwd=3)
box(which="figure")
Plot(rnorm(1000), col=5, pch=14, cex=1)#pch corresponds to the shape of the scatterplot point, CEX represents the size of the point
plot(x,col='RED', type='l', lty=6, lwd=3)#lwd is line width
plot(rnorm(100),xaxp=c(0,100,10),yaxp=c(0,5,1))#Change the scale of the coordinate axis
plot(rnorm(100),main="Plot Title", col.axis="blue", col.lab="red", col.main="darkblue")#Set title to blue, coordinate title to red, coordinate horizontal scale to blue
#Ity parameter: determines the line type
0: blank
1: solid(default)
2: dasked
3: dotted
4: dotdash
5: 1ongdash
6: twodash
Column Chart
setwd("D:\\R Language Learning") #install.packages("RColorBrewer") #if not already installed library(RColorBrewer) citysales<-read.csv("citysales.csv") barplot(as.matrix(citysales[,2:4]), beside=T, #When beside is TRUE, it represents a column chart, when F (or not written by default), it represents a stacked chart legend.text=citysales$City, args.legend=list(bty="n",horiz=TRUE), col=brewer.pal(5,"Set1"), #Paintboard, 5 Medium Colors border="white", ylim=c(0,100), ylab="Sales Revenue (1,000's of USD)", main="Sales Figures") box(bty="l") #legend.text=citysales$City for Legend args.legend=list(bty="n",horiz=TRUE)#Horizontal placement box(bty="l")#Indicates that the coordinate axis only has x and y, so if you change the style of the coordinate axis for bty here legend("topright", #Legend position is upper right corner + legend=c("Tokyo","NewYork","London","Berlin"), #Legend Content + col=c("red","blue","green","orange"), #Legend color + lty=1,lwd=2) #Large Legend
stack effect
citysales<-read.csv("citysales.csv") barplot(as.matrix(citysales[,2:4]), legend.text=citysales$City, args.legend=list(bty="n",horiz=TRUE),#Horizontal placement col=brewer.pal(5,"Set1"), border="white", ylim=c(0,200), ylab="Sales Revenue(1,000's of USD)", main="Sales Figures") box(bty="l")
Show percentage with stacking effect
Use Chapter V Data
citysalesperc<-read.csv("citysalesperc.csv")
par(mar=c(5,4,4,8),xpd=T)###mar=c(5,4,4,8) represents the edge width of the graph, xpd=the bounds of the T-plot
barplot(as.matrix(citysalesperc[,2:4]), col=brewer.pal(5,"Set1"), border="white", ylab="Sales Revenue (1,000's of USD)", main="Percentage Sales Figures") legend("right",legend=citysalesperc$City,bty="n",###Legend legend, where bty="n" means there is no box around the axis inset=c(-0.3,0), fill=brewer.pal(5,"Set1"))##Legend and Edge Distance box(bty="l")
Column chart in horizontal direction
barplot(as.matrix(citysales[,2:4]), beside=TRUE, horiz=TRUE, legend.text=citysales$City, args.legend=list(bty="n"), #Where bty="n" means there is no box around the axis col=brewer.pal(5,"Set1"), border="white", xlim=c(0,100), xlab="Sales Revenue(1,000's of USD)", main="Sales Figures") box(bty="l")
Stacked horizontal column showing percentages
par(mar=c(5,4,4,8),xpd=T) barplot(as.matrix(citysalesperc[,2:4]), horiz=TRUE, col=brewer.pal(5,"Set1"), border="white", xlab="Percentage of Sales", main="Perecentage Sales Figures") legend("right",legend=citysalesperc$City,bty="n", #Where bty="n" means there is no box around the axis inset=c(-0.3,0),fill=brewer.pal(5,"Set1")) box(bty="l")
Adjust the width, spacing and color of the column chart
barplot(as.matrix(citysales[,2:4]), beside=TRUE, legend.text=citysales$City, args.legend=list(bty="n",horiz=T), col=c("#E5562A","#491A5B","#8C6CA8","#BD1B8A","#7CB6E4"), border=FALSE, space=c(0,5), ##space=c(0,5) represents the distance between each column, 0 represents the distance within the group of columns, and 5 represents the distance between groups. ylim=c(0,100), ylab="Sales Revenue(1,000's of USD)", main="Sales Figures") box(bty="l")
barplot(as.matrix(citysales[,2:4]), beside=T, legend.text=citysales$City, args.legend=list(bty="n",horiz=T), #Where bty="n" means there is no box around the axis ylim=c(0,100),ylab="SalesRevenue (1,000's of USD)", main="Sales Figures") box(bty="l")
Display data at the top of the column
x<-barplot(as.matrix(citysales[,2:4]), beside=TRUE, legend.text=citysales$City, args.legend=list(bty="n",horiz=TRUE), #Where bty="n" means there is no box around the axis col=brewer.pal(5,"Set1"), border="gray",#m Border of each column ylim=c(0,100),ylab="Sales Revenue (1,000'sof USD)", main="Sales Figures") length("top") y<-as.matrix(citysales[,2:4]) text(x,y+2,labels=as.character(y))##y+2 plus one more 2 box(bty="l")
Label data beside horizontal columns
y<-barplot(as.matrix(citysales[,2:4]), beside=TRUE,horiz=TRUE, legend.text=citysales$City, args.legend=list(bty="n"), col=brewer.pal(5,"Set1"), border="white", xlim=c(0,100),xlab="Sales Revenue(1,000's of USD)", main="Sales Figures") x<-as.matrix(citysales[,2:4]) text(x+2,y,labels=as.character(x))
Label inside a column
rain<-read.csv("cityrain.csv") y<-barplot(as.matrix(rain[1,-1]), horiz=T, col="white", yaxt="n",main=" Rainfall in January",###yaxt="n" means no y-axis is displayed xlab="Rainfall(mm)") x<-0.5*rain[1,-1] text(x,y,colnames(rain[-1])) #box(bty="l")
Labeling error
sales<-t(as.matrix(citysales[,-1])) colnames(sales)<-citysales[,1] x<-barplot(sales, beside=T, legend.text=rownames(sales), args.legend=list(bty="n",horiz=F), col=brewer.pal(3,"Set2"), border="white", ylim=c(0,100), ylab="Sales Revenue (1,000's of USD)", main="Sales Figures") arrows(x0=x, y0=sales*0.95, x1=x, y1=sales*1.05, angle=90, code=3, length=0.04, lwd=0.4) legend("top") box(bty="l")
s
ales<-t(as.matrix(citysales[,-1])) colnames(sales)<-citysales[,1] x<-barplot(sales, beside=T, legend.text=rownames(sales), args.legend=list(bty="n",horiz=T), col=brewer.pal(3,"Set2"), border="white", ylim=c(0,100), ylab="Sales Revenue (1,000's of USD)", main="Sales Figures") arrows(x0=x, y0=sales*0.95, x1=x, y1=sales*1.05, angle=90, code=3, length=0.04, lwd=0.4) legend("topright",cex=0.6) box(bty="l")
plot(x) plots in ordinate and ordinal coordinates with element values of X
Binary mapping of plot (x, y) x (on the x-axis) and Y (on the y-axis)
sunflowerplot(x, y) is the same, but with points of similar coordinates as flowers, the number of petals is the number of points
pie(x) pie chart
Box plot(x) box plot ("box-and-whiskers")
stripchart(x) draws the value of X on a line segment and can be used as a substitute for a box chart when the sample size is small
coplot(x~y | z) Draws a binary graph of X and y for each value (or interval) of Z
interaction.plot (f1, f2, y) If F1 and F2 are factors, plot the mean of y, using the different values of F1 as the x-axis, while F2 does not
The same value corresponds to different curves; other statistics for y can be specified with the option fun (default calculated mean, fun=mean)
matplot(x,y) binary graph, where the first column of X corresponds to the first column of y, the second column of X corresponds to the second column of y, and so on.
dotchart(x) If x is a data frame, make a Cleveland point chart (row by row, column by column)
fourfoldplot(x) uses four quarter circles to display the 2times2 column connection (x must be an array of dim=c(2,2,k), or a matrix of dim=c(2,2), if k?1)
assocplot(x) Cohen-Friendly graph showing the degree to which row and column variables in a two-dimensional contingent table deviate from independence
Mosaic diagram of logarithmic linear regression residuals for mosaicplot(x) contiguous tables
pairs(x) if x is a matrix or a data frame, make a binary graph between the columns of X
plot.ts(x) If x is an object of class "ts", as a time series curve of x, x can be multivariate, but the sequence must have the same frequency and time
ts.plot(x) is the same, but if x is multivariate, the sequences can have different times but must have the same frequency
Frequency histogram of hist(x) x
Barplot(x) Bar graph of the value of X
qqnorm(x) normal quantile-quantile map
qqplot(x, y) y quantile-quantile map of X
contour(x, y, z) contour plot (values that are interpolated to fill in the blanks when drawing curves), X and y must be vectors and Z must be matrices so that dim(z)=c(length(x), length(y)) (x and y can be omitted)
filled.contour (x, y, z) Same as above, the area between contours is colored, and a legend is drawn with the corresponding color values
image(x, y, z) is the same, but the actual data size is represented in different colors
persp(x, y, z) Same as above, but Perspective
stars(x) Draw with stars and segments if x is a matrix or data frame
symbols(x, y,...) Draw symbols given by X and Y coordinates (circle, square, rectangle, star, thermometer or box), the type, size, color of the symbols are specified by another variable
(partial) influence diagram of termplot(mod.obj) regression model (mod.obj)
add=FALSE If TRUE, overlay the graph onto the previous graph (if any)
axes=TRUE If FALSE, do not draw axes and borders
Type="p" specifies the type of graphic, "p": point, "l": line, "b": point connection, "o": same, but line at point, "h": vertical line, "s": ladder, top of vertical line
Display data, "S": Same as above, but display data at the bottom of the vertical line
xlim=, ylim=Specifies the upper and lower limits of the axis, such as xlim=c(1,10) or xlim=range(x)
xlab=, ylab=label for axis, must be a character value
Main = main title, must be a character value
sub = subtitle (in small font)
Low level drawing commands
R's low-level mapping commands work on existing graphics, and the following table gives some main points:
Function Name Function
points(x, y) add points (you can use option type=)
lines(x, y) same as above, but add lines
text(x, y, labels,...)
Add labels to (x,y). Typical uses are plot(x, y,type="n"); text(x, y, names)
mtext(text, side=3,line=0,...) Add text specified by text to the margin, specify which side to add (refer to axis () below); line specifies the number of lines of text added from the drawing area
segments(x0, y0, x1,y1)
Draw segments from points (x0,y0) to points (x1,y1)
arrows(x0, y0,x1, y1, angle= 30,code=2) are the same, but with arrows drawn. If code=2, arrows are drawn at each (x0,y0); if code=1, arrows are drawn at each (x0,y0).
Draw arrows at each (x1,y1); if code=3, draw arrows angle control arrow axis to arrow at both ends
Angle of side.
abline(a,b) draws lines with slope B and intercept a
abline(h=y) draws a horizontal line at ordinate y
abline(v=x) draws a vertical line at x
abline(lm.obj) draws a regression line determined by lm.obj
Rect (x1, y1, x2,y2) Draws a rectangle, (x1, y1) is the lower left corner, (x2,y2) is the upper right corner
Polygon (x,y) draws polygons that connect points determined by x,y coordinates
legend(x, y, legend) adds a legend at a point (x,y) indicating that the content is given by Legend
title() Add a title or a subtitle
axis(side, vect) draws the axis. side=1 draws below; side=2 draws on the left; side=3 draws above;
Draw on the right when side=4. Optional parameter at specifies the coordinates of the position where the scale line is drawn
box() with a border on the current diagram
rug(x) Draws the position of the x-data with a short line on the x-axis
locator(n, type="n",...) Return the coordinates of N clicks (x,y) after the user clicks n times with the mouse on the graph; and draw at the clicks
Symbols (when type="p") or lines (when type="l") are not drawn by default
Drawing parameters
In addition to low-level drawing commands, graphic display can also be improved with drawing parameters.Drawing parameters
It can be used as an option for graphical functions (but not for all parameters), or as a function par(
To permanently change drawing parameters, that is, subsequent graphics will follow the parameters specified by the function par()
For example, the following command:
par(bg="yellow")
This will cause subsequent graphics to be drawn against a yellow background. There are 73 drawing parameters, some of which are very
Similar functionality. A detailed list of these parameters is available through help(par). The table below contains only columns
The most commonly used parameters are listed.
Parameter function
> adj controls the alignment of text: 0 is left alignment, 0.5 is center alignment, 1 is right alignment, value? 1 is the alignment position on the right side of the text, negative is the alignment position on the left side of the text; if two values are given (e.g. c(0, 0)), the second only controls the vertical alignment of the text baseline > BG Specifies the background color (e.g. bg="red", bg="blue"; 657 available color names can be displayed with colors()) > BTY controls the shape of the graphic border. The available values are:'o','l','7','c','u'and']' (the border and character look the same); if bty='n', n o border is drawn > CEX controls the value of symbol and text size in the default state; in addition, cex.axis controls the coordinate axis scale number size, cex.lab controls #Bubble chart axis label text size can be represented with cex=sqrt(w$profits), cex.main controls the caption text size, cex.sub controls the subtitle text size > col controls the color of the symbol; similar to cex, available: col.axis, col.lab, col.main, col.sub font Integers that control the font of text (1: normal, 2: italic, 3: bold, 4: bold italic); similar to cex, can also be used: font.axis, font.lab,font.main, font.sub. > Las Controls the direction of the numeric marker on the axis scale (0:parallel to axis, 1:horizontal, 2:perpendicular to axis, 3:vertical) > lty controls the line type of the line, which can be an integer (1:solid, 2:dashed, 3:dotted, 4:dotted, 5:long dashed, 6:double dashed) or a string of no more than eight characters (characters are numbers from 0 to 9) that alternately specifies the length of the line and the blank in points or pixels, such as lty="44" and lty=2, which have the same effect > LWD number to control connection width > The Mar control graph has a four-value vector c(bottom, left, top, right) with a default value of c(5.1, 4.1, 4.1, 2.1) > Vectors of mfcol C (nr, nc), splitting the drawing window into a matrix layout of NR rows and NC columns, using each child window mfrow in column order as above, but using each child window in row order > The type of PCH control symbol can be an integer from 1 to 25 or a single character in''(see Figure 2.4) pch corresponds to the shape of the scatter plot ps >Integer controlling text size in points > Pty Character specifying the type of drawing area,'s': Square,'m': Maximum use > TCK Specifies the value of the scale length on the axis, in percentage, based on the smallest width and high school of the graph; if tck=1, grid tcl is drawn >Same as above, but based on text line height (tcl=-0.5 by default) > xaxt If xaxt="n", set the x-axis but do not display it (help and axis(side=1, >...) Combined use) > yaxt Sets the y-axis if yaxt="n" but does not show it (helps with axis(side=2,...) > prod=T ordinate units in probability
Save Drawing Results Method
> plot(rnorm(1000)) > png("scatterplot. png") > plot(rnorm(1000)) > dev.off() windows 2
Point Chart
install.packages("reshape") library(reshape) sales<-melt(citysales) sales$color[sales[,2]=="ProductA"] <- "red" sales$color[sales[,2]=="ProductB"] <- "blue" sales$color[sales[,2]=="ProductC"] <- "violet" dotchart(sales[,3], labels=sales$City, groups=sales[,2], col=sales$color, pch=19, #Shape of pch corresponding scatterplot points main="Sales Figures", xlab="Sales Revenue (1,000's of USD)")
Pie chart
browsers<-read.table("browsers.txt",header=TRUE) browsers<-browsers[order(browsers[,2]),] pie(browsers[,2], labels=browsers[,1],#Label in Diagram clockwise=TRUE,#Clockwise drawing radius=1,#Pie radius col=brewer.pal(7,"Set1"),#Color, palette border="white",#Frame main="Percentage Share of InternetBrowserusage")
Label percentages on pie charts
browsers<-read.table("browsers.txt",header=TRUE) browsers<-browsers[order(browsers[,2]),] pielabels <- sprintf("%s = %3.1f%s", browsers[,1],100*browsers[,2]/sum(browsers[,2]), "%")##Label percentages on pie charts pie(browsers[,2], labels=pielabels, clockwise=TRUE, radius=1, col=brewer.pal(7,"Set1"), border="white", cex=0.8, main="Percentage Share of Internet Browser usage")
Add illustrations
browsers<-read.table("browsers.txt",header=TRUE) browsers<-browsers[order(browsers[,2]),] pielabels <- sprintf("%s = %3.1f%s", browsers[,1], 100*browsers[,2]/sum(browsers[,2]), "%") pie(browsers[,2], labels=NA, clockwise=TRUE, col=brewer.pal(7,"Set1"), border="white", radius=0.7, cex=0.8, main="Percentage Share of Internet Browser usage") legend("bottomright",legend=pielabels,bty="n",fill=brewer.pal(7,"Set1"))##Add illustrations
histogram
Use Chapter VI Data
air<-read.csv("airpollution.csv") hist(air$Nitrogen.Oxides,xlab="Nitrogen OxideConcentrations",main="Distribution of NitrogenOxide Concentrations")
Display in probability density
hist(air$Nitrogen.Oxides, freq=FALSE, xlab="Nitrogen Oxide Concentrations", main="Distribution of Nitrogen Oxide Concentrations")
Increase breaks
hist(air$Nitrogen.Oxides, breaks=20,xlab="Nitrogen Oxide Concentrations", main="Distribution of Nitrogen Oxide Concentrations")
Specify breaks range
hist(air$Nitrogen.Oxides, breaks=c(0,100,200,300,400,500,600), xlab="Nitrogen Oxide Concentrations", main="Distribution of Nitrogen Oxide Concentrations")
Beautify with color
hist(air$Respirable.Particles, prob=TRUE,col="black",border="wh ite", xlab="Respirable Particle Concentrations", main="Distribution of Respirable Particle Concentrations")
Beautify with lines
par(yaxs="i",las=1) hist(air$Respirable.Particles, prob=TRUE,col="black",border="whi te", xlab="Respirable Particle Concentrations", main="Distribution of Respirable Particle Concentrations") box(bty="l") grid(nx=NA,ny=NULL,lty=1,lwd=1,co l="gray")
Identity Density Function
par(yaxs="i",las=1) hist(air$Respirable.Particles, prob=TRUE,col="black",border="white", xlab="Respirable Particle Concentrations", main="Distribution of Respirable Particle Concentrations") box(bty="l") lines(density(air$Respirable.Particles,na.r m=T),col="red",lwd=4) grid(nx=NA,ny=NULL,lty=1,lwd=1,col=" gray")
A set of histograms
panel.hist <- function(x, ...) { par(usr = c(par("usr")[1:2], 0, 1.5) ) hist(x, prob=TRUE,add=TRUE,col="bla ck",border="white") } plot(iris[,1:4], main="Relationships between characteristics of iris flowers", pch=19,col="blue",cex=0.9, diag.panel=panel.hist)