1
|
|
# syntheticbench - R package for benchmarking of dataset serialization
|
2
|
|
#
|
3
|
|
# Copyright (C) 2019-present, Mark AJ Klik
|
4
|
|
#
|
5
|
|
# This file is part of the syntheticbench R package.
|
6
|
|
#
|
7
|
|
# The syntheticbench R package is free software: you can redistribute it and/or modify it
|
8
|
|
# under the terms of the GNU Affero General Public License version 3 as
|
9
|
|
# published by the Free Software Foundation.
|
10
|
|
#
|
11
|
|
# The syntheticbench R package is distributed in the hope that it will be useful, but
|
12
|
|
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
13
|
|
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License
|
14
|
|
# for more details.
|
15
|
|
#
|
16
|
|
# You should have received a copy of the GNU Affero General Public License along
|
17
|
|
# with the syntheticbench R package. If not, see <http://www.gnu.org/licenses/>.
|
18
|
|
#
|
19
|
|
# You can contact the author at:
|
20
|
|
# - syntheticbench R package source repository : https://github.com/fstpackage/syntheticbench
|
21
|
|
|
22
|
|
|
23
|
|
# predefined allowed characters: upper- and lower-case letters plus the
# digits 0-9 (the digits are coerced to character by c())
char_pool <- c(LETTERS, letters, 0:9)

# Build a single random string of `size` characters drawn from `char_pool`.
#
# Characters are sampled with replacement, so a character may occur more than
# once and `size` may exceed the number of characters in `char_pool`.
#
# @param size number of characters in the generated string
# @return character vector of length one ("" when `size` is 0)
generate_string <- function(size) {
  paste0(sample(char_pool, size, replace = TRUE), collapse = "")
}
|
30
|
|
|
31
|
|
|
32
|
|
#' Generate a character vector with certain distribution of string lengths
#'
#' The length of every generated string is drawn with replacement from
#' `min_str_size:max_str_size`; the strings themselves are produced by
#' `generate_string()`.
#'
#' @param length length of the vector
#' @param min_str_size minimum string length
#' @param max_str_size maximum string length
#' @param max_distict_values maximum number of distinct values in the vector.
#'   When `NULL` (the default) every element is generated independently.
#'   Otherwise `max_distict_values` strings are generated first and the result
#'   is sampled from those strings with replacement.
#'
#' @return character vector of length `length`
#' @export
sample_string <- function(length, min_str_size = 1, max_str_size = 10,
                          max_distict_values = NULL) {

  size_pool <- min_str_size:max_str_size

  # Draw `n` string sizes from `size_pool`. Indexing through sample.int()
  # instead of calling sample(size_pool, ...) avoids sample()'s special case
  # for a single number, where e.g. sample(5, ...) draws from 1:5 rather than
  # always returning 5 (hit whenever min_str_size == max_str_size).
  # base::length is spelled out because the `length` argument shadows it.
  draw_sizes <- function(n) {
    size_pool[sample.int(base::length(size_pool), n, replace = TRUE)]
  }

  # vapply() always yields a character vector, also for zero-length input
  # (sapply() would return an empty list there).
  make_strings <- function(n) {
    vapply(draw_sizes(n), generate_string, character(1), USE.NAMES = FALSE)
  }

  if (is.null(max_distict_values)) {
    return(make_strings(length))
  }

  candidates <- make_strings(max_distict_values)  # pool of candidate values
  candidates[sample.int(base::length(candidates), length, replace = TRUE)]
}
|