jacobkap / predictrace
Showing 1 of 172 files from the diff.
Other files ignored by Codecov
NAMESPACE has changed.
docs/LICENSE.html has changed.
R/sysdata.rda has changed.
docs/index.html has changed.
docs/pkgdown.yml has changed.
NEWS.md has changed.
R/data.R has changed.
CRAN-RELEASE was deleted.
README.Rmd has changed.
docs/404.html has changed.
docs/authors.html has changed.
README.md has changed.
DESCRIPTION has changed.

@@ -1,18 +1,26 @@
Loading
1 -
#' Find the race of a surname
1 +
#' Find the race of a surname or first name
2 +
#'
3 +
#' The surname data comes from the United States Census. The first name
4 +
#' data comes from Tzioumis (2018, <https://doi.org/10.1038/sdata.2018.25>).
2 5
#'
3 6
#' @param name
4 -
#' String or vector of strings of surname that you want to know the race of.
7 +
#' String or vector of strings of surname or first name that you want to know the race of.
5 8
#' @param probability
6 9
#' If TRUE (default) will provide columns for each race with the probability
7 10
#' that the name is of that race. If FALSE, will only return the name,
8 11
#' the match-name from the source data, and the most likely race.
12 +
#' @param surname
13 +
#' If TRUE (default) will return the race based on the inputted name being a surname.
14 +
#' If FALSE, will return the race based on the inputted name being a first name.
9 15
#'
10 16
#' @return
11 17
#' A data.frame with three or nine columns: The first column has the name as inputted,
12 18
#' the second column has the cleaned up name (no spaces or punctuation, all
13 -
#' lowercase), the third column tells the likely race of the surname (if there are multiple races with the same probability of a match, it will be a string with each race separated by a comma). If the
19 +
#' lowercase), the third column tells the likely race of the surname or first name
20 +
#' (if there are multiple races with the same probability of a match, it will be
21 +
#' a string with each race separated by a comma). If the
14 22
#' parameter probability is false, these three columns are all that is returned.
15 -
#' Otherwise, columns 4-9 tell the specific probability that the surname
23 +
#' Otherwise, columns 4-9 tell the specific probability that the surname or first name
16 24
#' is each race.
17 25
#' @export
18 26
#'
@@ -21,7 +29,9 @@
Loading
21 29
#'
22 30
#' predict_race(c("franklin", "Washington", "Jefferson", "Sotomayor", "Liu"))
23 31
#' predict_race("franklin", probability = FALSE)
24 -
predict_race <- function(name, probability = TRUE) {
32 +
#' predict_race("jacob", probability = FALSE, surname = FALSE)
33 +
#' predict_race("jacob", probability = TRUE, surname = FALSE)
34 +
predict_race <- function(name, probability = TRUE, surname = TRUE) {
25 35
26 36
  if (!is.character(name)) {
27 37
    stop("name must be a character type.")
@@ -38,7 +48,11 @@
Loading
38 48
39 49
  data$name <- tolower(data$name)
40 50
  data$name <- gsub("[[:punct:]]| ", "", data$name)
41 -
  data <- dplyr::left_join(data, predictrace::surnames_race, by = "name")
51 +
  if (surname) {
52 +
    data <- dplyr::left_join(data, predictrace::surnames_race, by = "name")
53 +
  } else {
54 +
    data <- dplyr::left_join(data, predictrace::first_names_race, by = "name")
55 +
  }
42 56
43 57
44 58
  names(data) <- gsub("^name$", "match_name", names(data))
@@ -62,3 +76,77 @@
Loading
62 76
  data$name <- as.character(data$name)
63 77
  return(data)
64 78
}
79 +
80 +
81 +
82 +
83 +
#' Find the gender of a first name
84 +
#'
85 +
#' The first name data comes from the United States Social Security Administration (SSA).
86 +
#' This data has the number of people with that name that are identified as female
87 +
#' or male so the probability female/male is the proportion of all people with that
88 +
#' name that are female/male. SSA data is available annually from 1880-2019; this
89 +
#' aggregates all years together.
90 +
#'
91 +
#' @param name
92 +
#' String or vector of strings of the first name that you want to know the gender of.
93 +
#' @param probability
94 +
#' If TRUE (default) will provide columns for each gender with the probability
95 +
#' that the first name is of that gender. If FALSE, will only return the name,
96 +
#' the match-name from the SSA data, and the most likely gender.
97 +
#'
98 +
#' @return
99 +
#' A data.frame with three or five columns: The first column has the name as inputted,
100 +
#' the second column has the cleaned up name (no spaces or punctuation, all
101 +
#' lowercase), the third column tells the likely gender of the first name
102 +
#' (if there are multiple genders with the same probability of a match, it will be
103 +
#' a string with each gender separated by a comma). If the
104 +
#' parameter probability is false, these three columns are all that is returned.
105 +
#' Otherwise, columns 4-5 tell the specific probability that the first name is female
106 +
#' or male.
107 +
#' @export
108 +
#'
109 +
#' @examples
110 +
#' predict_gender("tyrion")
111 +
#'
112 +
#' predict_gender(c("harry", "ron", "hermione", "DEAN", "NEVILLE", "Cho"))
113 +
#' predict_gender("franklin", probability = FALSE)
114 +
#' predict_gender("jacob", probability = FALSE)
115 +
#' predict_gender("jacob", probability = TRUE)
116 +
predict_gender <- function(name, probability = TRUE) {
117 +
118 +
  if (!is.character(name)) {
119 +
    stop("name must be a character type.")
120 +
  }
121 +
122 +
  if (is.numeric(probability) ||
123 +
      length(probability) != 1 ||
124 +
      !probability %in% c(TRUE, FALSE)) {
125 +
    stop("probability must either be TRUE or FALSE.")
126 +
  }
127 +
128 +
  data <- data.frame(old_name = name,
129 +
                     name     = name)
130 +
131 +
  data$name <- tolower(data$name)
132 +
  data$name <- gsub("[[:punct:]]| ", "", data$name)
133 +
  data <- dplyr::left_join(data, predictrace::first_names_gender, by = "name")
134 +
135 +
136 +
  names(data) <- gsub("^name$", "match_name", names(data))
137 +
  names(data) <- gsub("^old_name$", "name", names(data))
138 +
  if (probability == FALSE) {
139 +
    data <- data[, c("name",
140 +
                     "match_name",
141 +
                     "likely_gender")]
142 +
  } else {
143 +
    data <- data[, c("name",
144 +
                     "match_name",
145 +
                     "likely_gender",
146 +
                     "probability_female",
147 +
                     "probability_male")]
148 +
  }
149 +
  data      <- data.frame(data, stringsAsFactors = FALSE)
150 +
  data$name <- as.character(data$name)
151 +
  return(data)
152 +
}
Files Coverage
R/main.R 100.00%
Project Totals (1 files) 100.00%
Notifications are pending CI completion. Waiting for GitHub's status webhook to queue notifications. Push notifications now.
1
comment: false
2

3
coverage:
4
  status:
5
    project:
6
      default:
7
        target: auto
8
        threshold: 1%
9
    patch:
10
      default:
11
        target: auto
12
        threshold: 1%
Sunburst
The inner-most circle is the entire project, moving away from the center are folders then, finally, a single file. The size and color of each slice is representing the number of statements and the coverage, respectively.
Icicle
The top section represents the entire project. Proceeding with folders and finally individual files. The size and color of each slice is representing the number of statements and the coverage, respectively.
Grid
Each block represents a single file in the project. The size and color of each block is represented by the number of statements and the coverage, respectively.
Loading