#load tidyverse up
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(readxl)
library(glue)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
In this document, I will introduce the geom_dotplot() function and show what it’s for.
Dot plots are a type of graph that displays the distribution of a variable by placing a dot for each observation along a horizontal axis. The dots are stacked vertically if they have the same value on the x-axis. Dot plots work very well for data with a small number of values. They would not work well for large sets of data, because a dot would need to be plotted for each value.
A simple data set for understanding dot plots
# Toy data set: 22 students, each with a score out of 100.
# Repeated scores are spelled with rep() so the ties are easy to see.
Studnet_score <- data.frame(
  student = as.numeric(1:22),
  score = c(
    40, 50, 55, 65,
    rep(70, 5), rep(75, 4), rep(85, 2),
    90, 91, 94, 94, 97, 97, 100
  )
)
print(Studnet_score)
## student score
## 1 1 40
## 2 2 50
## 3 3 55
## 4 4 65
## 5 5 70
## 6 6 70
## 7 7 70
## 8 8 70
## 9 9 70
## 10 10 75
## 11 11 75
## 12 12 75
## 13 13 75
## 14 14 85
## 15 15 85
## 16 16 90
## 17 17 91
## 18 18 94
## 19 19 94
## 20 20 97
## 21 21 97
## 22 22 100
# Dot plot of the scores with every geom_dotplot() setting left at its default.
score <- ggplot(data = Studnet_score, mapping = aes(x = score))
score <- score + geom_dotplot()
print(score)
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
# Same scores again, now with an explicit bin width, dots stacked
# symmetrically about the axis, and a pink fill.
score <- ggplot(data = Studnet_score, mapping = aes(x = score)) +
  geom_dotplot(
    fill = "#FFAAD4",
    stackdir = "center",
    binwidth = 0.75
  )
print(score)
Data from: https://www.kaggle.com/datasets/harsh45/random-salary-data-of-employes-age-wise
# Read the salary CSV; cells containing the text "NA" are treated as missing.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before
# running the document anywhere else.
Salary_data <- readr::read_csv(
  file = "~/Downloads/R Programing /sph_r_programming_class_project_folders_2023 (1)/data/Salary_dataset.csv",
  na = "NA"
)
## New names:
## Rows: 30 Columns: 3
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (3): ...1, YearsExperience, Salary
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
# Compact overview of the imported columns and their types.
Salary_data %>% glimpse()
## Rows: 30
## Columns: 3
## $ ...1 <dbl> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, …
## $ YearsExperience <dbl> 1.2, 1.4, 1.6, 2.1, 2.3, 3.0, 3.1, 3.3, 3.3, 3.8, 4.0,…
## $ Salary <dbl> 39344, 46206, 37732, 43526, 39892, 56643, 60151, 54446…
There are two types of dot plot: the Wilkinson dot plot and the Cleveland dot plot.
Wilkinson Dot Plot
# Dot plot of years of experience, using the default bin width.
ggplot(data = Salary_data, mapping = aes(x = YearsExperience)) +
  geom_dotplot()
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
# Dot plot of salary, using the default bin width.
ggplot(data = Salary_data, mapping = aes(x = Salary)) +
  geom_dotplot()
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
Cleveland Dot Plot
# Salary plotted against years of experience, one point per row.
# The x axis carries YearsExperience, so the axis title must say so rather
# than "Employee".
ggplot(Salary_data, aes(x = YearsExperience, y = Salary)) +
  geom_point() +
  labs(x = "Years of experience")
# Violin of the salary distribution (tails not trimmed) with the individual
# salaries overlaid as dots, binned along y and stacked about the centre.
C <- ggplot(data = Salary_data, mapping = aes(x = ...1, y = Salary))
C <- C +
  geom_violin(trim = FALSE) +
  geom_dotplot(fill = "#E69F00", stackdir = "center", binaxis = "y") +
  labs(x = "Employee")
print(C)
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
Adding box plot
# Box plot of salary with the individual salaries overlaid as dots.
# `...1` is a numeric row index, so without an explicit group geom_boxplot()
# warns "Continuous x aesthetic -- did you forget aes(group=...)?".
# group = 1 states the intent: all rows belong to one box.
e <- ggplot(Salary_data, aes(x = ...1, y = Salary, group = 1))
e + geom_boxplot(width = 0.5) +
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "lightgray") +
  labs(x = "Employee")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
# Median of the Salary column.
with(Salary_data, median(Salary))
## [1] 65238
# Notched box plot of salary with the individual salaries overlaid as dots.
# group = 1 puts every row in a single box; without it the numeric `...1`
# column on x makes geom_boxplot() warn about a continuous x aesthetic.
B <- ggplot(Salary_data, aes(x = ...1, y = Salary, group = 1)) +
  geom_boxplot(notch = TRUE) +
  geom_dotplot(binaxis = "y", stackdir = "center") +
  labs(x = "Employee")
B
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
## notch went outside hinges. Try setting notch=FALSE.
Discuss whether you think this function is useful for you and your work. Is it the best thing since sliced bread, or is it not really relevant to your work?
Yes, dot plots were useful for my data. Unlike other types of plots that aggregate data, such as histograms or box plots, dot plots display each individual data point as a dot, which was really helpful for identifying patterns and outliers. This made my data very easy and clear to read.
However, there are also some potential limitations of dot plots, such as the possibility of overplotting when there are many data points, and the difficulty of comparing the distribution of multiple variables or groups on the same plot.
Overall, whether or not dot plots are useful depends on the specific data being analyzed and the research question being addressed. It’s always a good idea to consider different types of plots and choose the one that best suits the data and the purpose of the analysis.