这是一个使用大部分建议的R函数的第一次尝试:
require( "hablar" )
require( "DescTools" )
is_factor <- function( x, unique.p=0.10, unique.n=(length(x)*unique.p),
first.n=25, max.v=2, b.to.f=FALSE )
{
cat( paste0( "\n----------------- ", deparse(substitute(x)), "\n\n" ) )
if( is.numeric(x) | is.logical(x) )
{ x <- x[ is.finite(x) ] }
if( is.character(x) )
{
x[ x == "NaN" | x == "Inf" ] <- NA
x <- na.omit(x)
}
n <- length(x)
if( n == 0 )
{
cat( "The variable is empty (all NAs)" )
return(FALSE)
}
cat( paste0( "Valid N (after NA drop) = ", n, "\n" ) )
cat( paste0( "Unique levels/values of x = ", length(unique(x)), "\n" ) )
cat( paste0( "unique.n argument = ", unique.n, "\n" ) )
cat( paste0( "unique.p argument = ", unique.p, "\n\n" ) )
if( "factor" %in% class(x) )
{
cat( "has class FACTOR \n" )
cat( paste0( "Values of x: \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n\n" ) )
cat( "#### IS FACTOR #### \n\n\n" )
return(TRUE)
}
if( "logical" %in% class(x) )
{
cat( "has class LOGICAL: is NOT a factor \n\n" )
cat( paste0( "Values of x: \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n\n" ) )
return(FALSE)
}
if( any( DescTools::IsDate(x) ) )
{
x.dates <- x[ DescTools::IsDate(x) ]
cat( "x has class DATE: is NOT a factor \n" )
cat( paste0( "Values of x: \n", paste( head( unique(x.dates), 10 ), collapse=",\n" ) ) )
return(FALSE)
}
if( "character" %in% class(x) )
{
cat( "x has class CHARACTER: \n\n" )
if( length(unique(x)) == 1 )
{
cat( "All values of x are the same: \n" )
cat( paste0( "Values of x: \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n" ) )
if( b.to.f )
{
cat( "Convert binary to factor is set to TRUE \n\n" )
cat( "#### IS FACTOR #### \n\n\n" )
return(TRUE)
}
cat( "Convert binary to factor is set to FALSE \n\n" )
return(FALSE)
}
is.same <- length( unique( nchar(x) ) ) == 1 & length(unique(x)) < ( n * unique.p )
if( is.same )
{
cat( "All strings have the same number of characters \n\n" )
cat( paste0( "Values of x (first 10): \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n\n" ) )
}
n.unique <- length( unique( x ) )
p.unique <- length( unique( x ) ) / n
is.small.unique.n <- n.unique <= unique.n & p.unique <= unique.p
if( is.small.unique.n )
{
cat( "x has a small number & proportion of unique cases\n" )
cat( paste0( "N < ", unique.n, " & prop < ", unique.p, "\n" ) )
cat( paste0( "Number of unique values of x: ", length(unique(x)), "\n" ) )
cat( paste0( "Values of x (first 10): \n", paste( head( sort(unique(x)), 10 ), collapse=",\n" ), "\n\n" ) )
}
first.n.total <- table(x) %>% sort(desc=T) %>% head( first.n ) %>% sum()
total.p <- first.n.total / n
is.large.p.total <- total.p > 0.90
first.n.levels <- table(x) %>% sort(desc=T) %>% head( first.n ) %>% names()
if( is.large.p.total )
{
cat( paste0( "First ", first.n, " levels accounts for > 90% of total cases \n" ) )
cat( paste0( "First N levels: \n", paste( first.n.levels, collapse=",\n" ), "\n\n" ) )
}
if( is.same | is.small.unique.n | is.large.p.total )
{
cat( "#### IS FACTOR #### \n\n\n" )
return(TRUE)
}
}
x <- hablar::retype(x)
if( "numeric" %in% class(x) )
{
cat( "x is non-integer number: NOT a factor \n\n" )
cat( paste0( "Values of x (first 10): \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n\n" ) )
return(FALSE)
}
if( "integer" %in% class(x) )
{
cat( "x has class INTEGER: \n\n" )
if( all( x %in% c(0,1) ) | length(unique(x))==1 )
{
cat( "All values of x are 0/1 or a single value: \n" )
cat( paste0( "Values of x: \n", paste( head( unique(x), 10 ), collapse=",\n" ), "\n" ) )
if( b.to.f )
{
cat( "Convert binary to factor is set to TRUE \n\n" )
cat( "#### IS FACTOR #### \n\n\n" )
return(TRUE)
}
cat( "Convert binary to factor is set to FALSE \n\n" )
return(FALSE)
}
if( any( x < 0 ) )
{
cat( "Contains negative integers \n" )
cat( paste0( "Range x: ", range(x), "\n\n" ) )
return(FALSE)
}
n.unique <- length( unique( x ) )
p.unique <- length( unique( x ) ) / n
is.small.unique.n <- n.unique <= unique.n & p.unique <= unique.p
if( is.small.unique.n )
{
cat( "x has a small number & proportion of unique cases \n" )
cat( paste0( "unique(x) < ", unique.n, " & unique(x)/length(x) < ", unique.p, " \n" ) )
cat( paste0( "Number of unique values of x: ", length(unique(x)), "\n" ) )
cat( paste0( "Values of x (first 10): \n", paste( head( sort(unique(x)), 10 ), collapse=",\n" ), "\n\n" ) )
}
starts.with.one <- min(x) == 1
width.of.range.x <- max(x) - min(x) + 1
is.approx.seq <- length(unique(x)) / width.of.range.x > 0.8
is.seq.from.one <- starts.with.one & is.approx.seq
if( is.seq.from.one )
{ cat( "x is an approximate sequence of integers starting with one \n\n" ) }
is.true.seq <- length(unique(x)) == width.of.range.x &
length(unique(x))/length(x) < unique.p
if( is.true.seq )
{
cat( "x is a true sequence of integers \n" )
cat( paste0( "Values: \n", paste( sort(unique(x)), collapse=",\n" ), "\n\n" ) )
}
is.equal.intervals <- length( unique( x[-1] - x[-length(x)] ) ) == 1
if( is.equal.intervals )
{
cat( "All values of x have equal intervals between them \n" )
cat( paste0( "Values: ", paste( head(sort(unique(x))), collapse="," ), "\n\n" ) )
}
is.small.var <- var(x) < max.v
if( is.small.var )
{ cat( paste0( "The variance of x is below ", max.v, "\n\n" ) ) }
if( is.small.unique.n | is.seq.from.one | is.true.seq | is.equal.intervals )
{
cat( "#### IS FACTOR #### \n\n\n" )
return(TRUE)
}
}
cat( "There are a large number of unique values: x is NOT a factor \n" )
cat( paste0( "Number of unique values of x: ", length(unique(x)), "\n" ) )
cat( paste0( "Values of x (first 10): \n", paste( head( sort(unique(x)), 10 ), collapse=",\n" ), "\n\n" ) )
return( FALSE )
}
样例数据集:mtcars:
可能的因素包括:
- cyl(汽缸数量)
- gear(变速器数量)
- carb(化油器数量)
- vs(0/1表示V形或直立发动机)
- am(0/1表示自动或手动变速器)
Cyl和gear被标记为因素。Carb有6个唯一值或6/32 = 18%的独特比例得分,高于unique.p设置的10%阈值。
这些参数将对样本大小敏感-例如,在具有几百个地址的数据集中,50个唯一状态代码代表了总值的很大比例,但是州的数量不会随着大小而增长,因此独特值占总案例的比例随着数据集的增长而自然变小。这些演示数据集是敏感的。
如果您想要将二进制变量标记为因素,则可以将参数“b.to.f”设置为TRUE:例如在这种情况下,VS和AM。
> head( mtcars )
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
lapply( mtcars, is_factor )
----------------- mpg
Valid N (after NA drop) = 32
Unique levels/values of x = 25
unique.n argument = 3.2
unique.p argument = 0.1
x is non-integer number: NOT a factor
Values of x (first 10):
21,
22.8,
21.4,
18.7,
18.1,
14.3,
24.4,
19.2,
17.8,
16.4
----------------- cyl
Valid N (after NA drop) = 32
Unique levels/values of x = 3
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
x has a small number & proportion of unique cases
unique(x) < 3.2 & unique(x)/length(x) < 0.1
Number of unique values of x: 3
Values of x (first 10):
4,
6,
8
----------------- disp
Valid N (after NA drop) = 32
Unique levels/values of x = 27
unique.n argument = 3.2
unique.p argument = 0.1
x is non-integer number: NOT a factor
Values of x (first 10):
160,
108,
258,
360,
225,
146.7,
140.8,
167.6,
275.8,
472
----------------- hp
Valid N (after NA drop) = 32
Unique levels/values of x = 22
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
There are a large number of unique values: x is NOT a factor
Number of unique values of x: 22
Values of x (first 10):
52,
62,
65,
66,
91,
93,
95,
97,
105,
109
----------------- drat
Valid N (after NA drop) = 32
Unique levels/values of x = 22
unique.n argument = 3.2
unique.p argument = 0.1
x is non-integer number: NOT a factor
Values of x (first 10):
3.9,
3.85,
3.08,
3.15,
2.76,
3.21,
3.69,
3.92,
3.07,
2.93
----------------- wt
Valid N (after NA drop) = 32
Unique levels/values of x = 29
unique.n argument = 3.2
unique.p argument = 0.1
x is non-integer number: NOT a factor
Values of x (first 10):
2.62,
2.875,
2.32,
3.215,
3.44,
3.46,
3.57,
3.19,
3.15,
4.07
----------------- qsec
Valid N (after NA drop) = 32
Unique levels/values of x = 30
unique.n argument = 3.2
unique.p argument = 0.1
x is non-integer number: NOT a factor
Values of x (first 10):
16.46,
17.02,
18.61,
19.44,
20.22,
15.84,
20,
22.9,
18.3,
18.9
----------------- vs
Valid N (after NA drop) = 32
Unique levels/values of x = 2
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
All values of x are 0/1 or a single value:
Values of x:
0,
1
Convert binary to factor is set to FALSE
----------------- am
Valid N (after NA drop) = 32
Unique levels/values of x = 2
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
All values of x are 0/1 or a single value:
Values of x:
1,
0
Convert binary to factor is set to FALSE
----------------- gear
Valid N (after NA drop) = 32
Unique levels/values of x = 3
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
x has a small number & proportion of unique cases
unique(x) < 3.2 & unique(x)/length(x) < 0.1
Number of unique values of x: 3
Values of x (first 10):
3,
4,
5
x is a true sequence of integers
Values:
3,
4,
5
The variance of x is below 2
----------------- carb
Valid N (after NA drop) = 32
Unique levels/values of x = 6
unique.n argument = 3.2
unique.p argument = 0.1
x has class INTEGER:
There are a large number of unique values: x is NOT a factor
Number of unique values of x: 6
Values of x (first 10):
1,
2,
3,
4,
6,
8
$mpg
[1] FALSE
$cyl
[1] TRUE
$disp
[1] FALSE
$hp
[1] FALSE
$drat
[1] FALSE
$wt
[1] FALSE
$qsec
[1] FALSE
$vs
[1] FALSE
$am
[1] FALSE
$gear
[1] TRUE
$carb
[1] FALSE
unique
函数对所有列进行去重。如果只有几个“级别”,那么它可能是一个分类变量。要一次性完成,请使用apply(cars,2,unique)
。 - Rentrop