分割-应用-组合在地理距离计算中的应用

7

我已经从人口普查局下载了美国所有城镇和城市等的列表。以下是一个随机样本:

dput(somewhere)
structure(list(state = structure(c(30L, 31L, 5L, 31L, 24L, 36L, 
13L, 21L, 6L, 10L, 31L, 28L, 10L, 5L, 5L, 8L, 23L, 11L, 34L, 
19L, 29L, 4L, 24L, 13L, 21L, 31L, 2L, 3L, 29L, 24L, 1L, 13L, 
15L, 10L, 11L, 33L, 35L, 8L, 11L, 12L, 36L, 28L, 9L, 31L, 8L, 
14L, 11L, 12L, 36L, 13L, 8L, 5L, 29L, 8L, 7L, 23L, 25L, 39L, 
16L, 28L, 10L, 29L, 26L, 8L, 32L, 40L, 28L, 23L, 37L, 31L, 18L, 
5L, 1L, 31L, 18L, 13L, 11L, 10L, 25L, 18L, 21L, 18L, 11L, 35L, 
31L, 36L, 20L, 19L, 38L, 2L, 40L, 13L, 36L, 11L, 29L, 27L, 22L, 
17L, 12L, 20L), .Label = c("ak", "al", "ar", "az", "ca", "co", 
"fl", "ga", "hi", "ia", "il", "in", "ks", "ky", "la", "md", "mi", 
"mo", "ms", "mt", "nc", "nd", "ne", "nj", "nm", "nv", "ny", "oh", 
"ok", "or", "pa", "pr", "ri", "sc", "sd", "tx", "ut", "va", "wa", 
"wi"), class = "factor"), geoid = c(4120100L, 4280962L, 668028L, 
4243944L, 3460600L, 4871948L, 2046000L, 3747695L, 839965L, 1909910L, 
4244824L, 3902204L, 1963750L, 622360L, 669088L, 1382972L, 3125230L, 
1722736L, 4539265L, 2804705L, 4039625L, 451465L, 3467020L, 2077150L, 
3765260L, 4221792L, 133904L, 566320L, 4033150L, 3463180L, 223460L, 
2013675L, 2232405L, 1951600L, 1752142L, 4445010L, 4655684L, 1336416L, 
1729080L, 1840842L, 4804672L, 3932207L, 1523675L, 4260384L, 1321912L, 
2159232L, 1735307L, 1867176L, 4839304L, 2057350L, 1309656L, 655380L, 
4082250L, 1350680L, 1275475L, 3147745L, 3505010L, 5352285L, 2483337L, 
3909834L, 1912945L, 4068200L, 3227900L, 1366304L, 7286831L, 5505350L, 
3982390L, 3149915L, 4974480L, 4249440L, 2943346L, 677430L, 280770L, 
4247872L, 2902242L, 2039075L, 1735281L, 1932565L, 3580120L, 2973852L, 
3722620L, 2943238L, 1755938L, 4643100L, 4251904L, 4830920L, 3056575L, 
2801940L, 5156048L, 137000L, 5508925L, 2057300L, 4861172L, 1736477L, 
4021200L, 3677783L, 3832060L, 2614900L, 1820332L, 3043750L), 
    ansicode = c(2410344L, 2390453L, 2411793L, 1214759L, 885360L, 
    2412035L, 485621L, 2403359L, 2412812L, 2583481L, 2390095L, 
    2397971L, 2396237L, 2585422L, 2411819L, 2405746L, 2398338L, 
    2394628L, 2812929L, 2586582L, 2408478L, 2582836L, 885393L, 
    2397270L, 2402898L, 2584453L, 2405811L, 2405518L, 2412737L, 
    2389752L, 2418574L, 2393549L, 2402559L, 2629970L, 2399453L, 
    2378109L, 2812999L, 2402563L, 2398956L, 2396699L, 2409759L, 
    2393028L, 2414061L, 2805542L, 2404192L, 2404475L, 2398514L, 
    2629884L, 2408486L, 2396265L, 2405306L, 2411363L, 2413515L, 
    2405064L, 2402989L, 2583899L, 2629103L, 2585016L, 2390487L, 
    2397481L, 2393811L, 2413298L, 2583927L, 2812702L, 2415078L, 
    1582764L, 2400116L, 2400036L, 2412013L, 2633665L, 2787908L, 
    2583158L, 2418866L, 1214943L, 2393998L, 485611L, 2398513L, 
    2394969L, 2806756L, 2397053L, 2406485L, 2395719L, 2399572L, 
    1267480L, 2389516L, 2410660L, 2409026L, 2806379L, 2584894L, 
    2404746L, 2586459L, 2396263L, 2411528L, 2398556L, 2412443L, 
    2584298L, 1036064L, 2806333L, 2396920L, 2804282L), city = c("donald", 
    "warminster heights", "san juan capistrano", "littlestown", 
    "port republic", "taylor", "merriam", "northlakes", "julesburg", 
    "california junction", "lower allen", "antwerp", "pleasantville", 
    "el rancho", "santa clarita", "willacoochee", "kennard", 
    "effingham", "la france", "beechwood", "keys", "orange grove mobile manor", 
    "shiloh", "west mineral", "stony point", "east salem", "heath", 
    "stamps", "haworth", "rio grande", "ester", "clayton", "hackberry", 
    "middle amana", "new baden", "melville", "rolland colony", 
    "hannahs mill", "germantown hills", "la fontaine", "aurora", 
    "green meadows", "kaiminani", "pinecroft", "dawson", "park city", 
    "hinsdale", "st. meinrad", "kingsland", "powhattan", "bowersville", 
    "palos verdes estates", "wyandotte", "meigs", "waverly", 
    "sunol", "arroyo hondo", "outlook", "west pocomoke", "buchtel", 
    "chatsworth", "smith village", "glenbrook", "rock spring", 
    "villalba", "bayfield", "waynesfield", "utica", "sunset", 
    "milford square", "lithium", "swall meadows", "unalaska", 
    "martinsburg", "ashland", "leawood", "hindsboro", "gray", 
    "turley", "trimble", "falcon", "linn", "olympia fields", 
    "mitchell", "mount pleasant mills", "greenville", "park city", 
    "arkabutla", "new river", "huntsville", "boulder junction", 
    "potwin", "red lick", "huey", "dougherty", "wadsworth", "grand forks", 
    "chassell", "edgewood", "lindsay"), lsad = c("25", "57", 
    "25", "21", "25", "25", "25", "57", "43", "57", "57", "47", 
    "25", "57", "25", "25", "47", "25", "57", "57", "57", "57", 
    "21", "25", "57", "57", "43", "25", "43", "57", "57", "25", 
    "57", "57", "47", "57", "57", "57", "47", "43", "25", "57", 
    "57", "57", "25", "25", "47", "57", "57", "25", "43", "25", 
    "43", "25", "57", "57", "57", "57", "57", "47", "25", "43", 
    "57", "57", "62", "25", "47", "47", "25", "57", "57", "57", 
    "25", "21", "25", "25", "47", "25", "57", "25", "43", "25", 
    "47", "25", "57", "25", "57", "57", "57", "25", "57", "25", 
    "25", "47", "43", "57", "25", "57", "43", "57"), funcstat = c("a", 
    "s", "a", "a", "a", "a", "a", "s", "a", "s", "s", "a", "a", 
    "s", "a", "a", "a", "a", "s", "s", "s", "s", "a", "a", "s", 
    "s", "a", "a", "a", "s", "s", "a", "s", "s", "a", "s", "s", 
    "s", "a", "a", "a", "s", "s", "s", "a", "a", "a", "s", "s", 
    "a", "a", "a", "a", "a", "s", "s", "s", "s", "s", "a", "a", 
    "a", "s", "s", "s", "a", "a", "a", "a", "s", "s", "s", "a", 
    "a", "a", "a", "a", "a", "s", "a", "a", "a", "a", "a", "s", 
    "a", "s", "s", "s", "a", "s", "a", "a", "a", "a", "s", "a", 
    "s", "a", "s"), latitude = c(45.221487, 40.18837, 33.500889, 
    39.74517, 39.534798, 30.573263, 39.017607, 35.780523, 40.984864, 
    41.56017, 40.226748, 41.180176, 41.387011, 36.220684, 34.414083, 
    31.335094, 41.474697, 39.120662, 34.616281, 32.336723, 35.802786, 
    32.598451, 39.462418, 37.283906, 35.867809, 40.608713, 31.344839, 
    33.354959, 33.840898, 39.019051, 64.879056, 39.736866, 29.964958, 
    41.794765, 38.536765, 41.559549, 44.3437, 32.937302, 40.768954, 
    40.673893, 33.055942, 39.867193, 19.757709, 40.564189, 31.771864, 
    37.093499, 41.800683, 38.168142, 30.666141, 39.761734, 34.373065, 
    33.774271, 36.807143, 31.071788, 27.985282, 41.154105, 36.534599, 
    46.331153, 38.096527, 39.463511, 42.916301, 35.45079, 39.100123, 
    34.81467, 18.127809, 46.81399, 40.602442, 40.895279, 41.13806, 
    40.433182, 37.831844, 37.50606, 53.910255, 40.310917, 38.812464, 
    38.907263, 39.684775, 41.841711, 36.736661, 39.476152, 35.194804, 
    38.478798, 41.521996, 43.730057, 40.724697, 33.111939, 45.630946, 
    34.700227, 37.142945, 34.782275, 46.1148, 37.938624, 33.485081, 
    38.605285, 34.399808, 42.821447, 47.921291, 47.036116, 40.103208, 
    47.224885), longitude = c(-122.837813, -75.084089, -117.654388, 
    -77.089213, -74.476099, -97.427116, -94.693955, -81.367835, 
    -102.262708, -95.994752, -76.902769, -84.736099, -93.272787, 
    -119.068357, -118.494729, -83.044003, -96.203696, -88.550859, 
    -82.770697, -90.808692, -94.941358, -114.660588, -75.29244, 
    -94.926801, -81.044121, -77.23694, -86.46905, -93.497879, 
    -94.657035, -74.87787, -148.041153, -100.176484, -93.410178, 
    -91.901539, -89.707193, -71.301933, -96.59226, -84.340945, 
    -89.462982, -85.722023, -97.509615, -83.945334, -156.001765, 
    -78.353464, -84.443499, -86.048077, -87.928172, -86.832128, 
    -98.454026, -95.634011, -83.084305, -118.425754, -94.729305, 
    -84.092683, -81.625304, -102.762746, -105.666602, -120.092812, 
    -75.579197, -82.180426, -96.514499, -97.457006, -119.927289, 
    -85.238869, -66.481897, -90.822546, -83.973881, -97.345349, 
    -112.028388, -75.405024, -89.88325, -118.642656, -166.529029, 
    -78.324286, -92.239531, -94.62524, -88.134729, -94.985863, 
    -107.792147, -94.561898, -78.65389, -91.844989, -87.691648, 
    -98.029974, -77.026451, -96.110256, -108.925311, -90.121565, 
    -80.595817, -86.532599, -89.654438, -97.01835, -94.161474, 
    -89.289973, -97.05148, -77.893875, -97.08933, -88.530745, 
    -85.737461, -105.152791), designation = c("city", "cdp", 
    "city", "borough", "city", "city", "city", "cdp", "town", 
    "cdp", "cdp", "village", "city", "cdp", "city", "city", "village", 
    "city", "cdp", "cdp", "cdp", "cdp", "borough", "city", "cdp", 
    "cdp", "town", "city", "town", "cdp", "cdp", "city", "cdp", 
    "cdp", "village", "cdp", "cdp", "cdp", "village", "town", 
    "city", "cdp", "cdp", "cdp", "city", "city", "village", "cdp", 
    "cdp", "city", "town", "city", "town", "city", "cdp", "cdp", 
    "cdp", "cdp", "cdp", "village", "city", "town", "cdp", "cdp", 
    "urbana", "city", "village", "village", "city", "cdp", "cdp", 
    "cdp", "city", "borough", "city", "city", "village", "city", 
    "cdp", "city", "town", "city", "village", "city", "cdp", 
    "city", "cdp", "cdp", "cdp", "city", "cdp", "city", "city", 
    "village", "town", "cdp", "city", "cdp", "town", "cdp")), row.names = c(22769L, 
24845L, 3314L, 24015L, 17360L, 28139L, 10085L, 19881L, 3886L, 
8750L, 24027L, 20585L, 9362L, 2499L, 3333L, 6041L, 16321L, 6847L, 
25249L, 14051L, 22233L, 1210L, 17425L, 10353L, 20053L, 23545L, 
253L, 1951L, 22166L, 17386L, 685L, 9771L, 11134L, 9225L, 7386L, 
25001L, 25862L, 5663L, 6950L, 8239L, 26555L, 20991L, 6108L, 24388L, 
5551L, 10772L, 7056L, 8470L, 27292L, 10202L, 5451L, 3116L, 22660L, 
5776L, 5317L, 16546L, 17582L, 29958L, 12103L, 20709L, 8779L, 
22515L, 16665L, 5902L, 31901L, 30658L, 21745L, 16574L, 28632L, 
24127L, 15046L, 3455L, 930L, 24087L, 14494L, 10016L, 7055L, 8993L, 
18048L, 15434L, 19615L, 15043L, 7454L, 25775L, 24194L, 27115L, 
15857L, 14038L, 29305L, 276L, 30693L, 10201L, 27863L, 7075L, 
22046L, 19267L, 20311L, 12502L, 8093L, 15798L), class = "data.frame")

我想使用longitude和latitude列以及gdist函数计算city列中每个条目之间的距离。我知道以下的for循环可以工作并且易于阅读:

# For every row i, compute the vector of distances (in miles) from that
# row's coordinates to every row of `somewhere`; collect one vector per
# row in a list, exactly as the original for loop did.
dist_list <- lapply(seq_len(nrow(somewhere)), function(i) {
  gdist(lon.1 = somewhere$longitude[i],
        lat.1 = somewhere$latitude[i],
        lon.2 = somewhere$longitude,
        lat.2 = somewhere$latitude,
        units = "miles")
})

然而:在完整数据集(31K+行)上运行时间非常长,需要几个小时。我正在寻找能够加速计算的方法。我认为采用“分割-应用-合并”方法中的一些方式会很好,因为最终我想涉及一对分组变量,即“geo_block”和“ansi_block”,但实际上任何比目前方法更好的方法都可以。
我尝试了以下方法:
# Derive grouping variables from the leading digit of each code.
somewhere$geo_block <- substr(somewhere$geoid, 1, 1)
somewhere$ansi_block <- substr(somewhere$ansicode, 1, 1)

# NOTE(review): this attempt is broken as written -- split() expects a
# single list of factors (split(., list(.$geo_block, .$ansi_block))),
# it returns a plain list so mutate() cannot be piped onto it directly,
# and Imap::gdist() has no `longlat` argument.
somewhere <- somewhere %>%
  split(.$geo_block, .$ansi_block) %>%
  mutate(dist = gdist(longlat = somewhere[, c("longitude", "latitude")]))

但我不确定如何在标准for循环之外指定第二组经纬度输入。

我的问题:

  1. 如何使用split-apply-combine方法解决这个问题,其中将geo_block和ansi_block作为分组变量,如上所述? 我想返回最短距离以及对应于此距离的city的名称和geo_block的值。

欢迎所有建议。理想情况下,期望的结果会相当快,因为我实际处理的数据集非常大。由于我有点迷茫,所以我在问题中添加了一些奖励,以产生更多的兴趣,并希望能得到广泛的潜在答案,让我能够学习。非常感谢!


1
如果你想要每一对距离,为什么要使用state作为分组变量?有些州会比其他州拥有更多的位置。相反,你应该使用均等大小的分组,比如每500或1000行。 - Gregor Thomas
这不是我想表达的意思...如果你在10个州中每个州有50个位置,那么按州分组意味着你最大的计算会产生一个50x50的矩阵,而不分组则会创建一个单一的500x500。最终我不确定,这只是我的推断。 - r2evans
2
jvalenti,我不明白你在输出中期望什么。所有点之间的成对距离(无论是否按“state”分组)在维度上要比行数大得多,因此它将无法干净地适合框架中。你想要最接近的吗?距离的统计数据?如果按“state”分组,你是否关心两个位置可能相隔几英尺但不会成对? - r2evans
3
强相关问题:计算大距离矩阵的更快方法。不知道gdist的效率如何,但那里的一个答案显示spatialrisk::haversine比geosphere::distm快一个数量级。 - Gregor Thomas
2
还有这个:https://dev59.com/Trjoa4cB1Zd3GeqPFtz-#59915415,提供了一种快速和内存高效的方法来从经纬度创建距离。 - dww
显示剩余4条评论
4个回答

4
我将展示使用tidyverse和基本R的分割-应用-组合方法。我的理解是,对于每个城市组中的每个城市(无论你的分组变量是什么),您想要报告最近的城市和到该最近城市的距离。
首先,关于Imap :: gdist的几点说明:
- 它没有longlat参数。必须通过lon.1、lat.1、lon.2、lat.2这四个参数传递坐标。 - 它可能没有正确向量化。它的主体包含一个循环while(abs(lamda - lamda.old) > 1e-11){...},其中lamda的初始值为(lon.1 - lon.2) * pi/180。为使循环条件的长度为1(应如此),参数lon.1和lon.2必须具有长度1。
所以,至少现在,我将以更经典的geosphere::distm为基础回答。它将整个n乘n的对称距离矩阵存储在内存中,因此您可能会遇到内存分配限制,但这在分割-应用-组合中发生的可能性要小得多,其中n是每个组(而不是总共)的城市数。
我将使用数据框架中的这部分进行工作:
library("dplyr")

# Keep only what we need: two grouping factors taken from the leading
# digit of geoid/ansicode, plus the city ID and its coordinates.
dd <- somewhere %>%
  as_tibble() %>%
  mutate(geo_block = as.factor(as.integer(substr(geoid, 1L, 1L))),
         ansi_block = as.factor(as.integer(substr(ansicode, 1L, 1L)))) %>%
  select(geo_block, ansi_block, city, longitude, latitude)
dd
# # A tibble: 100 × 5
#    geo_block ansi_block city                longitude latitude
#    <fct>     <fct>      <chr>                   <dbl>    <dbl>
#  1 4         2          donald                 -123.      45.2
#  2 4         2          warminster heights      -75.1     40.2
#  3 6         2          san juan capistrano    -118.      33.5
#  4 4         1          littlestown             -77.1     39.7
#  5 3         8          port republic           -74.5     39.5
#  6 4         2          taylor                  -97.4     30.6
#  7 2         4          merriam                 -94.7     39.0
#  8 3         2          northlakes              -81.4     35.8
#  9 8         2          julesburg              -102.      41.0
# 10 1         2          california junction     -96.0     41.6
# # … with 90 more rows

以下函数find_nearest用于在d维度的度量空间中给定一组n个点时,执行识别最近邻居的基本任务。它的参数如下:
  1. data:一个数据框(data frame),有n行(每行代表一个点)、d个变量指定点坐标,以及一个或多个ID变量与每个点关联。
  2. dist:一个函数,接受一个n×d的坐标矩阵作为参数,并返回一个相应的n×n的对称距离矩阵。
  3. coordvar:一个列出坐标变量名称的字符向量。
  4. idvar:一个列出ID变量名称的字符向量。
在我们的数据框中,坐标变量是longitude和latitude,并且有一个ID变量city。为简单起见,我已经相应地定义了coordvar和idvar的默认值。
## For each row (point) of `data`, find the nearest OTHER row according
## to `dist`, and return the focal rows' non-coordinate columns together
## with the nearest neighbour's ID columns and the distance to it.
##
## Arguments:
##   data     - data frame; one row per point
##   dist     - function mapping an n-by-d coordinate matrix to an
##              n-by-n symmetric distance matrix
##   coordvar - names of the coordinate columns in `data`
##   idvar    - names of the ID columns to report for the neighbour
find_nearest <- function(data, dist, 
                         coordvar = c("longitude", "latitude"), 
                         idvar = "city") {
  m <- match(coordvar, names(data), 0L)
  n <- nrow(data)
  if (n < 2L) {
    ## NA_integer_[n] is NA for n == 1 and integer(0) for n == 0, so
    ## groups too small to have a neighbour yield NA results.
    argmin <- NA_integer_[n]
    distance <- NA_real_[n]
  } else {
    ## Compute distance matrix
    D <- dist(as.matrix(data[m]))
    ## Extract minimum distances
    diag(D) <- Inf # want off-diagonal distances
    ## which.min per column gives each point's nearest neighbour index.
    argmin <- apply(D, 2L, which.min)
    ## Matrix indexing pulls D[argmin[j], j] for every column j.
    distance <- D[cbind(argmin, seq_len(n))]
  }
  ## Return focal point data, nearest neighbour ID, distance
  r1 <- data[-m]
  r2 <- data[argmin, idvar, drop = FALSE]
  names(r2) <- paste0(idvar, "_nearest")
  data.frame(r1, r2, distance, row.names = NULL, stringsAsFactors = FALSE)
}

这是应用于我们数据框的find_nearest的输出结果,没有分组,距离由Vincenty椭球法计算。
# Distance function: full n-by-n matrix of Vincenty ellipsoid distances
# (in metres), computed with geosphere.
dist_vel <- function(x) {
  geosphere::distm(x, fun = geosphere::distVincentyEllipsoid)
}

# Nearest city per row, computed over the whole (ungrouped) data set.
res <- find_nearest(dd, dist = dist_vel)
head(res, 10L)
#    geo_block ansi_block                city         city_nearest  distance
# 1          4          2              donald              outlook 246536.39
# 2          4          2  warminster heights       milford square  38512.79
# 3          6          2 san juan capistrano palos verdes estates  77722.35
# 4          4          1         littlestown          lower allen  55792.53
# 5          3          8       port republic           rio grande  66935.70
# 6          4          2              taylor            kingsland  98997.90
# 7          2          4             merriam              leawood  13620.87
# 8          3          2          northlakes          stony point  30813.46
# 9          8          2           julesburg                sunol  46037.81
# 10         1          2 california junction              kennard  19857.41

这里是整洁的宇宙(split-apply-combine)技术,按照geo_blockansi_block进行分组:

# Split-apply-combine with dplyr: nearest city WITHIN each
# (geo_block, ansi_block) group.
dd %>%
  group_by(geo_block, ansi_block) %>%
  group_modify(~find_nearest(., dist = dist_vel))
# # A tibble: 100 × 5
# # Groups:   geo_block, ansi_block [13]
#    geo_block ansi_block city                city_nearest  distance
#    <fct>     <fct>      <chr>               <chr>            <dbl>
#  1 1         2          california junction gray            89610.
#  2 1         2          pleasantville       middle amana   122974.
#  3 1         2          willacoochee        meigs          104116.
#  4 1         2          effingham           hindsboro       72160.
#  5 1         2          heath               dawson         198052.
#  6 1         2          middle amana        pleasantville  122974.
#  7 1         2          new baden           huey            37147.
#  8 1         2          hannahs mill        dawson         129599.
#  9 1         2          germantown hills    hindsboro      165140.
# 10 1         2          la fontaine         edgewood        63384.
# # … with 90 more rows

比如说,注意到在这个分组方式下,被认为是距离加利福尼亚交汇点最近的城市从肯纳德变成了格雷,虽然格雷更远。

这里介绍的是基于 R 语言底层函数 by 的分裂-应用-组合方法。除了分组结果的排序方式不同之外,它与前面提到的方法等价:

# Base-R split-apply-combine: by() splits `data` on the grouping
# variables in `by`, applies find_nearest() to each group, and rbind()
# reassembles one data frame. `...` is forwarded to find_nearest().
find_nearest_by <- function(data, by, ...) {
  do.call(rbind, base::by(data, by, find_nearest, ...))
}

res <- find_nearest_by(dd, by = dd[c("geo_block", "ansi_block")], dist = dist_vel)
head(res, 10L)
#    geo_block ansi_block                city city_nearest   distance
# 1          3          1         grand forks         <NA>         NA
# 2          4          1         littlestown  martinsburg  122718.95
# 3          4          1         martinsburg  littlestown  122718.95
# 4          4          1            mitchell  martinsburg 1671365.58
# 5          5          1            bayfield         <NA>         NA
# 6          1          2 california junction         gray   89609.71
# 7          1          2       pleasantville middle amana  122974.32
# 8          1          2        willacoochee        meigs  104116.21
# 9          1          2           effingham    hindsboro   72160.43
# 10         1          2               heath       dawson  198051.76

这个排序揭示了find_nearest对于只包含一个城市的组返回NA。如果我们打印整个tibble,我们将在tidyverse结果中看到这些NA

顺便说一下,以下是geosphere中实现计算大圆距离的各种函数的基准测试,没有关于精度的说明,不过你可以从结果推测出Vincenty椭球距离切割角最少(哈哈)...

# Benchmark several geosphere great-circle implementations by plugging
# each one into find_nearest() on the ungrouped data.
fn <- function(dist) find_nearest(dd, dist = dist)

library("geosphere")
dist_geo <- function(x) distm(x, fun = distGeo)
dist_cos <- function(x) distm(x, fun = distCosine)
dist_hav <- function(x) distm(x, fun = distHaversine)
dist_vsp <- function(x) distm(x, fun = distVincentySphere)
dist_vel <- function(x) distm(x, fun = distVincentyEllipsoid)
dist_mee <- function(x) distm(x, fun = distMeeus)

microbenchmark::microbenchmark(
  fn(dist_geo),
  fn(dist_cos),
  fn(dist_hav),
  fn(dist_vsp),
  fn(dist_vel),
  fn(dist_mee),
  times = 1000L
)
# Unit: milliseconds
#          expr        min         lq       mean     median         uq       max neval
#  fn(dist_geo)   6.143276   6.291737   6.718329   6.362257   6.459345  45.91131  1000
#  fn(dist_cos)   4.239236   4.399977   4.918079   4.461804   4.572033  45.70233  1000
#  fn(dist_hav)   4.005331   4.156067   4.641016   4.210721   4.307542  41.91619  1000
#  fn(dist_vsp)   3.827227   3.979829   4.446428   4.033621   4.123924  44.29160  1000
#  fn(dist_vel) 129.712069 132.549638 135.006170 133.935479 135.248135 174.88874  1000
#  fn(dist_mee)   3.716814   3.830999   4.234231   3.883582   3.962712  42.12947  1000

Imap::gdist似乎是Vincenty椭球距离的纯R实现。这可能是造成其速度缓慢的原因...

最后的一些备注:

  • find_nearest函数中的dist参数不需要构建在geosphere距离上。您可以传递任何满足我上面所说的约束条件的函数。
  • find_nearest可以被调整为接受返回类为"dist"对象的函数dist。这些对象仅存储n*(n-1)/2个元素,即n×n对称距离矩阵的下三角部分(参见?dist)。这将改善对大n的支持,并使find_nearest与@dww在评论中提出的这种内存高效的Haversine距离实现兼容。只需解决如何转换诸如apply(D, 2L, which.min)之类的操作即可。[更新:我已经在另一个回答中实现了这个对find_nearest的改变,我提供了新的基准测试结果。]

1
idvar 可以是 names(data) 的任何子集。如果您使用 data 包含 citylsad(除了 longitudelatitude),并执行 res <- find_nearest(data, dist, idvar = c("city", "lsad")),那么 res 应该包含 city_nearestlsad_nearest - Mikael Jagan
1
我已经编辑了我的答案,使find_nearest函数稍微更加通用和有用于其他人。这个更改不会破坏答案中的任何代码。(我只是用单个参数coordvar替换了参数lonvarlatvar。) - Mikael Jagan
1
这是您的意思吗?dd %>% mutate(row = seq_len(nrow(.))) %>% group_by(geo_block, ansi_block) %>% group_modify(~find_nearest(., dist = dist_vel, idvar = "row")) - Mikael Jagan
当对完整数据使用时,会生成错误消息:错误:在列行上使用mutate()存在问题 - jvalenti
1
也许值得尝试一下我另一个答案中的 find_nearest2。在我的机器上,它能够在不到两分钟的时间内处理40000个经纬度对,甚至不需要分割-应用-合并。 - Mikael Jagan
显示剩余5条评论

3

你的问题与R:更快地计算大型距离矩阵的方法非常相似。其中一种选择是使用spatialrisk::haversine():


library(spatialrisk)
library(data.table)
library(optiRum)
library(sf)
#> Linking to GEOS 3.9.1, GDAL 3.2.3, PROJ 7.2.1
library(s2)

# Create data.table
# NOTE(review): column x holds latitude and y holds longitude here --
# mind the argument order in the haversine() call below.
s_dt = data.table::data.table(city = somewhere$city,
                              x = somewhere$latitude, 
                              y = somewhere$longitude)

# Cross join two data tables (all n*n ordered city pairs)
coordinates_dt <- optiRum::CJ.dt(s_dt, s_dt)
# Haversine distance per pair; dist_m > 0 drops self-pairs, then the
# minimum-distance partner is kept per (city, x, y).
distance_m <- coordinates_dt[, dist_m := spatialrisk::haversine(y, x, i.y, i.x)][
  dist_m > 0, .SD[which.min(dist_m)], by = .(city, x, y)]
head(distance_m)
#>                   city        x          y               i.city      i.x
#> 1:  warminster heights 40.18837  -75.08409               shiloh 39.46242
#> 2: san juan capistrano 33.50089 -117.65439 palos verdes estates 33.77427
#> 3:         littlestown 39.74517  -77.08921          lower allen 40.22675
#> 4:       port republic 39.53480  -74.47610           rio grande 39.01905
#> 5:              taylor 30.57326  -97.42712               aurora 33.05594
#> 6:             merriam 39.01761  -94.69396              leawood 38.90726
#>           i.y    dist_m
#> 1:  -75.29244 31059.909
#> 2: -118.42575 87051.444
#> 3:  -76.90277 24005.689
#> 4:  -74.87787 47227.822
#> 5:  -97.50961 37074.461
#> 6:  -94.62524  7714.126

您还可以使用出色的sf包。例如,sf :: st_nearest_feature()提供:
# sf/s2 alternative: st_nearest_feature() returns, for every geometry,
# the index of the nearest other geometry.
s_sf <- sf::st_as_sf(s_dt, coords = c("x", "y"))
cities <- sf::st_as_sfc(s2::as_s2_geography(s_sf))
s_dt$city_nearest <-  s_dt$city[sf::st_nearest_feature(cities)]

比较两种方法的速度:

# Speed comparison: data.table cross-join + spatialrisk::haversine ...
method_1 <- function(){
  coordinates_dt[, dist_m := spatialrisk::haversine(y, x, i.y, i.x)][
    dist_m > 0, .SD[which.min(dist_m)], by = .(city, x, y)]
}

# ... versus sf::st_nearest_feature() on s2 geographies.
method_2 <- function(){
  s_dt$city[sf::st_nearest_feature(cities)]
}

microbenchmark::microbenchmark(
  method_1(), 
  method_2(), 
  times = 100
)
#> Unit: milliseconds
#>        expr        min         lq       mean     median         uq       max
#>  method_1()   5.385391   5.652444   6.234329   5.772923   6.003445  11.60981
#>  method_2() 182.730850 188.408202 203.348667 199.049937 211.682795 303.14904
#>  neval
#>    100
#>    100

本示例创建于2021-11-14,使用reprex包 v2.0.1。


1
我的初步回答末尾留下的评论后续,我想分享一个更强大的find_nearest版本,它还接受返回"dist"类对象的函数dist。这些对象是双倍向量,仅存储n*(n-1)/2个非冗余元素的n-by-n距离矩阵(参见?dist)。当我们使用"dist"对象而不是矩阵时,可以同时考虑的经度-纬度对的最大数量从N增加到大约sqrt(2)*N,而不会遇到内存限制。
如果您跳到结尾,那么您会发现使用find_nearest2和@dww的快速calc_dist函数(参见here),它返回"dist"对象,可以让我在不到两分钟的时间内处理n=40000个未分组的经度-纬度对。 find_nearest2依赖于类"dist"的子集方法的存在,该方法对"dist"对象x像对应的矩阵一样进行操作,而不构造这些矩阵。更准确地说,x[i, j]应该返回as.matrix(x)[i, j]的值,而不构造as.matrix(x)。幸运的是,对于我们的目的,这种优化只在两种特殊情况下是必要的:(i)列提取和(ii)使用矩阵索引进行元素提取。
## Subset method for class "dist" that handles two hot paths -- single
## column extraction and element extraction by a 2-column integer
## matrix -- directly on the packed lower-triangle vector, without ever
## materializing the full n-by-n matrix. Any other subsetting falls
## back on as.matrix() coercion.
`[.dist` <- function(x, i, j, drop = TRUE) {
  class(x) <- NULL
  p <- length(x)
  ## Recover n from the packed length: p = n * (n - 1) / 2.
  n <- as.integer(round(0.5 * (1 + sqrt(1 + 8 * p)))) # p = n * (n - 1) / 2
  
  ## Column extraction
  if (missing(i) && !missing(j) && is.integer(j) && length(j) == 1L && !is.na(j) && j >= 1L && j <= n) {
    if (j == 1L) {
      ## Column 1 is the zero diagonal entry followed by the first
      ## n - 1 packed elements.
      return(c(0, x[seq_len(n - 1L)]))
    }
    ## Convert 2-ary index of 'D' to 1-ary index of 'D[lower.tri(D)]'
    ii <- rep.int(j - 1L, j - 1L)
    jj <- 1L:(j - 1L)
    if (j < n) {
      ii <- c(ii, j:(n - 1L))
      jj <- c(jj, rep.int(j, n - j))
    }
    kk <- ii + round(0.5 * (2L * (n - 1L) - jj) * (jj - 1L))
    ## Extract
    res <- double(n)
    res[-j] <- x[kk]
    nms <- attr(x, "Labels")
    if (drop) {
      names(res) <- nms
    } else {
      dim(res) <- c(n, 1L)
      dimnames(res) <- list(nms, nms[j])
    }
    return(res)
  }
  
  ## Element extraction with matrix indices
  if (missing(j) && !missing(i) && is.matrix(i) && dim(i)[2L] == 2L && is.integer(i) && !anyNA(i) && all(i >= 1L & i <= n)) {
    m <- dim(i)[1L]
    ## Subset off-diagonal entries
    d <- i[, 1L] == i[, 2L]
    i <- i[!d, , drop = FALSE]
    ## Transpose upper triangular entries
    u <- i[, 2L] > i[, 1L]
    i[u, 1:2] <- i[u, 2:1]
    ## Convert 2-ary index of 'D' to 1-ary index of 'D[lower.tri(D)]'
    ii <- i[, 1L] - 1L
    jj <- i[, 2L]
    kk <- ii + (jj > 1L) * round(0.5 * (2L * (n - 1L) - jj) * (jj - 1L))
    ## Extract; diagonal entries stay 0 by construction.
    res <- double(m)
    res[!d] <- x[kk] 
    return(res)
  }

  ## Fall back on coercion for any other subset operation
  as.matrix(x)[i, j, drop = drop]
}

这是一个简单的测试。
# Sanity check: the optimized "dist" subset agrees with the matrix form.
n <- 6L
do <- dist(seq_len(n))
dm <- unname(as.matrix(do))
ij <- cbind(sample(6L), sample(6L))
identical(do[, 4L], dm[, 4L]) # TRUE
identical(do[ij], dm[ij]) # TRUE

这里是find_nearest2函数。请注意,该函数中应用了子集操作符[来访问对象D的元素。由于我们在子集方法中进行了优化,因此这些操作都不需要将"dist"转换为"matrix"类型。
## Variant of find_nearest that also accepts `dist` functions returning
## packed "dist" objects. It only uses column extraction and
## matrix-index extraction on D, both of which the optimized `[.dist`
## method above serves without building the full n-by-n matrix.
find_nearest2 <- function(data, dist, coordvar, idvar) {
  m <- match(coordvar, names(data), 0L)
  n <- nrow(data)
  if (n < 2L) {
    ## NA for n == 1, zero-length for n == 0 (no neighbour possible).
    argmin <- NA_integer_[n]
    distance <- NA_real_[n]
  } else {
    ## Compute distance matrix
    D <- dist(data[m])
    ## Extract minimum off-diagonal distances
    ## (mask the self-distance with Inf before taking which.min)
    patch.which.min <- function(x, i) {
      x[i] <- Inf
      which.min(x)
    }
    argmin <- integer(n)
    index <- seq_len(n)
    for (j in index) {
      ## forceAndCall evaluates column j eagerly, one column at a time.
      argmin[j] <- forceAndCall(2L, patch.which.min, D[, j], j)
    }
    distance <- D[cbind(argmin, index)]
  }
  ## Return focal point data, nearest neighbour ID, distance
  r1 <- data[-m]
  r2 <- data[argmin, idvar, drop = FALSE]
  names(r2) <- paste0(idvar, "_nearest")
  data.frame(r1, r2, distance, row.names = NULL, stringsAsFactors = FALSE)
}

让我们获取@dww的C++代码,以便在我们的R会话中可以使用calc_dist
## @dww's Rcpp implementation of pairwise Haversine distances. It fills
## only the n*(n-1)/2 lower-triangle entries in column-major order,
## i.e. the packed layout used by objects of class "dist". The C++
## source is an R string compiled on the fly by Rcpp::sourceCpp();
## the string itself is runtime data and is left untouched.
code <- '#include <Rcpp.h>
using namespace Rcpp;

double distanceHaversine(double latf, double lonf, double latt, double lont, double tolerance) {
  double d;
  double dlat = latt - latf;
  double dlon =  lont - lonf;
  d = (sin(dlat * 0.5) * sin(dlat * 0.5)) + (cos(latf) * cos(latt)) * (sin(dlon * 0.5) * sin(dlon * 0.5));
  if(d > 1 && d <= tolerance){
    d = 1;
  }
  return 2 * atan2(sqrt(d), sqrt(1 - d)) * 6378137.0;
}

double toRadians(double deg){
  return deg * 0.01745329251;  // PI / 180;
}

// [[Rcpp::export]]
NumericVector calc_dist(Rcpp::NumericVector lat, 
                        Rcpp::NumericVector lon, 
                        double tolerance = 10000000000.0) {
  std::size_t nlat = lat.size();
  std::size_t nlon = lon.size();
  if (nlat != nlon) throw std::range_error("lat and lon different lengths");
  if (nlat < 2) throw std::range_error("Need at least 2 points");
  std::size_t size = nlat * (nlat - 1) / 2;
  NumericVector ans(size);
  std::size_t k = 0;
  double latf;
  double latt;
  double lonf;
  double lont;
  
  for (std::size_t j = 0; j < (nlat-1); j++) {
    for (std::size_t i = j + 1; i < nlat; i++) {
      latf = toRadians(lat[i]);
      lonf = toRadians(lon[i]);
      latt = toRadians(lat[j]);
      lont = toRadians(lon[j]);
      ans[k++] = distanceHaversine(latf, lonf, latt, lont, tolerance);
    }
  }
  
  return ans;
}
'
Rcpp::sourceCpp(code = code)

让我们比较 find_nearest2n 分别等于 100、20000 和 40000 时的性能,并且使用两个函数 dist 进行比较,一个使用 geosphere::distm 构建完整的距离矩阵,另一个使用 @dww 的 calc_dist 构建一个 "dist" 对象。这两个函数都计算 Haversine 距离。
# Random lon/lat test frame with an integer ID column.
rx <- function(n) {
  data.frame(id = seq_len(n), lon = rnorm(n), lat = rnorm(n))
}
# Full n-by-n Haversine distance matrix via geosphere.
dist_hav <- function(x) {
  geosphere::distm(x, fun = geosphere::distHaversine)
}
# Packed lower-triangle Haversine distances via @dww's calc_dist,
# dressed up with the attributes expected of a "dist" object.
dist_dww <- function(x) {
  res <- calc_dist(x[, 2L], x[, 1L])
  attr(res, "class") <- "dist"
  attr(res, "Size") <- nrow(x)
  attr(res, "Diag") <- FALSE
  attr(res, "Upper") <- FALSE
  attr(res, "call") <- match.call()
  res
}

这是基准测试:
# Benchmark driver: run find_nearest2 with a pluggable distance backend.
fn2 <- function(data, dist) {
  find_nearest2(data, dist = dist, coordvar = c("lon", "lat"), idvar = "id")
}

# Small case: n = 100.
x1 <- rx(100L)
microbenchmark::microbenchmark(
  fn2(x1, dist_hav), 
  fn2(x1, dist_dww), 
  times = 1000L
)
# Unit: microseconds
#               expr      min       lq     mean   median       uq       max neval
#  fn2(x1, dist_hav) 3768.310 3886.452 4680.300 3977.492 4131.796 34461.361  1000
#  fn2(x1, dist_dww)  930.044  992.241 1128.272 1017.005 1045.746  7006.326  1000

# Medium case: n = 20000 -- the full matrix still fits in memory.
x2 <- rx(20000L)
microbenchmark::microbenchmark(
  fn2(x2, dist_hav),
  fn2(x2, dist_dww),
  times = 100L
)
# Unit: seconds
#               expr      min       lq     mean   median       uq      max neval
#  fn2(x2, dist_hav) 29.60596 30.04249 30.29052 30.14016 30.45054 31.53976   100
#  fn2(x2, dist_dww) 18.71327 19.01204 19.12311 19.09058 19.26680 19.62273   100

# Large case: n = 40000 -- only the packed "dist" backend fits in memory.
x3 <- rx(40000L)
microbenchmark::microbenchmark(
  # fn2(x3, dist_hav), # runs out of memory
  fn2(x3, dist_dww),
  times = 10L
)
# Unit: seconds
#               expr      min      lq     mean  median       uq      max neval
#  fn2(x3, dist_dww) 104.8912 105.762 109.1512 109.653 112.2543 112.9265    10

因此,在我的机器上,如果应用于您的完整、未分组的数据集(n 约为32000),find_nearest2(dist = dist_dww) 至少会在不到两分钟的时间内完成,而且不会耗尽内存。

0

我有一个解决方案。而且我自己都很惊讶,我用了两个循环for!! 不可思议,我做到了。首先要做的事情。

我的建议是基于简化。然而,在短距离内你会犯的错误相对较小。但时间节省是巨大的!

好吧,我建议用笛卡尔坐标来计算距离,而不是球面坐标。

因此,我们需要一个简单的函数,根据两个参数latitudelongitude计算笛卡尔坐标。这就是我们的LatLong2Cart特性。

# Convert geographic coordinates given in DEGREES to 3-D Cartesian
# coordinates on a sphere of radius REarth (default 6371, i.e. km).
#
# Bug fix: R's cos()/sin() expect RADIANS; the original fed degrees
# straight into them, producing meaningless coordinates (e.g. the two
# Alaskan towns in the sample, really ~1700 km apart, came out as
# ~9245 km). Degrees are now converted to radians first.
#
# Returns a tibble with columns x, y, z (same interface as before).
LatLong2Cart = function(latitude, longitude, REarth = 6371) { 
  lat <- latitude * pi / 180    # degrees -> radians
  lon <- longitude * pi / 180
  tibble(
    x = REarth * cos(lat) * cos(lon),
    y = REarth * cos(lat) * sin(lon),
    z = REarth * sin(lat))
}

首先简单说明一下,我的函数计算距离的单位是公里(毕竟这是国际单位制)。如果你想要使用其他单位比如英里、码、英尺等,可以通过输入半径来进行转换。
现在让我们看看它是如何工作的。但是在此之前,让我先将你的data.frame转换为tibble
library(tidyverse)
somewhere = somewhere %>% as_tibble()

# Append Cartesian x/y/z columns alongside the original data.
somewhere %>%  
  mutate(LatLong2Cart(latitude, longitude))

输出

# A tibble: 100 x 12
   state   geoid ansicode city                lsad  funcstat latitude longitude designation      x      y      z
   <fct>   <int>    <int> <chr>               <chr> <chr>       <dbl>     <dbl> <chr>        <dbl>  <dbl>  <dbl>
 1 or    4120100  2410344 donald              25    a            45.2    -123.  city        -1972.   644.  6024.
 2 pa    4280962  2390453 warminster heights  57    s            40.2     -75.1 cdp         -4815. -1564.  3867.
 3 ca     668028  2411793 san juan capistrano 25    a            33.5    -118.  city          485. -3096.  5547.
 4 pa    4243944  1214759 littlestown         21    a            39.7     -77.1 borough       350.  2894.  5665.
 5 nj    3460600   885360 port republic       25    a            39.5     -74.5 city        -1008. -1329.  6149.
 6 tx    4871948  2412035 taylor              25    a            30.6     -97.4 city        -4237.   160. -4755.
 7 ks    2046000   485621 merriam             25    a            39.0     -94.7 city         1435.  -686.  6169.
 8 nc    3747695  2403359 northlakes          57    s            35.8     -81.4 cdp         -2066.  -670. -5990.
 9 co     839965  2412812 julesburg           43    a            41.0    -102.  town         1010.  6223.  -915.
10 ia    1909910  2583481 california junction 57    s            41.6     -96.0 cdp           840.  4718. -4198.
# ... with 90 more rows

一旦我们有笛卡尔坐标,让我们计算适当的距离。我为此准备了calcDist函数。

# Pairwise Euclidean (chord) distances between all rows of `data`,
# which must carry Cartesian coordinate columns x, y and z.
#
# Returns a tibble of all n*n ordered pairs of `key` values (Var1,
# Var2) with their distance, in the same column-major order as the
# original double loop (element i + (j-1)*n = distance between rows
# i and j).
#
# Improvements over the original:
#   * the two nested R-level for loops are replaced by vectorized
#     outer() arithmetic -- same values, same order, much faster;
#   * typo fixed in the error message ("date" -> "data").
calcDist = function(data, key){
  if(!all(c("x", "y", "z") %in% names(data))) {
    stop("data must contain the variables x, y and z!")
  }
  key = enquo(key)
  x = data$x
  y = data$y
  z = data$z
  keys = data %>% pull(!!key)
  # n-by-n matrices of coordinate differences; as.vector() flattens
  # column-major, matching the original dist[i + (j-1)*n] indexing.
  D = sqrt(outer(x, x, "-")^2 + outer(y, y, "-")^2 + outer(z, z, "-")^2)
  tibble(
    expand.grid(factor(keys), factor(keys)),
    dist = as.vector(D)
    )
}

无论如何,这就是地方。在这里我使用了这两个for循环。希望能得到原谅!

好的,让我们看看这些循环是否能做些什么。

# Distances between all city pairs, ungrouped.
somewhere %>%  
  mutate(LatLong2Cart(latitude, longitude)) %>% 
  calcDist(city)

输出

# A tibble: 10,000 x 3
   Var1                Var2     dist
   <fct>               <fct>   <dbl>
 1 donald              donald     0 
 2 warminster heights  donald  4197.
 3 san juan capistrano donald  4500.
 4 littlestown         donald  3253.
 5 port republic       donald  2200.
 6 taylor              donald 11025.
 7 merriam             donald  3660.
 8 northlakes          donald 12085.
 9 julesburg           donald  9390.
10 california junction donald 11358.
# ... with 9,990 more rows

正如您所看到的,一切都很顺利。好的,但是如何在分组数据上使用它呢?这并不难。让我们按州进行分组。

# Grouped by state: nest each state's rows, compute within-group
# distances, then unnest back into one long tibble.
somewhere %>% 
  mutate(LatLong2Cart(latitude, longitude)) %>% 
  nest_by(state) %>% 
  mutate(dist = list(calcDist(data, city))) %>% 
  select(-data) %>% 
  unnest(dist)

输出

# A tibble: 400 x 4
# Groups:   state [40]
   state Var1                      Var2                        dist
   <fct> <fct>                     <fct>                      <dbl>
 1 ak    ester                     ester                         0 
 2 ak    unalaska                  ester                      9245.
 3 ak    ester                     unalaska                   9245.
 4 ak    unalaska                  unalaska                      0 
 5 al    heath                     heath                         0 
 6 al    huntsville                heath                     12597.
 7 al    heath                     huntsville                12597.
 8 al    huntsville                huntsville                    0 
 9 ar    stamps                    stamps                        0 
10 az    orange grove mobile manor orange grove mobile manor     0 
# ... with 390 more rows

这里可能存在一些数据冗余(例如从城市A到城市B的距离和从城市B到城市A的距离),但整个实现过程相对容易,您自己也可能会认同这一点。

好的。现在是时候看看当我们按照geo_blockansi_block进行分组时它将如何工作了。

# Same pattern with the geo_block/ansi_block grouping the OP asked for.
somewhere %>% 
  mutate(LatLong2Cart(latitude, longitude)) %>% 
  mutate(
    geo_block = substr(geoid, 1, 1),
    ansi_block = substr(ansicode, 1, 1)) %>% 
  nest_by(geo_block, ansi_block) %>% 
  mutate(dist = list(calcDist(data, city))) %>% 
  select(-data) %>% 
  unnest(dist)

输出

# A tibble: 1,716 x 5
# Groups:   geo_block, ansi_block [13]
   geo_block ansi_block Var1                Var2                  dist
   <chr>     <chr>      <fct>               <fct>                <dbl>
 1 1         2          california junction california junction     0 
 2 1         2          pleasantville       california junction 10051.
 3 1         2          willacoochee        california junction 11545.
 4 1         2          effingham           california junction 11735.
 5 1         2          heath               california junction  4097.
 6 1         2          middle amana        california junction  7618.
 7 1         2          new baden           california junction 12720.
 8 1         2          hannahs mill        california junction 11681.
 9 1         2          germantown hills    california junction  5097.
10 1         2          la fontaine         california junction 11397.
# ... with 1,706 more rows

正如您所看到的,这没有任何问题。

最后,一个实用的提示。当您使用此解决方案时,请确保分组变量不超过约20,000行。这可能会导致对于如此大的数据而言内存分配问题。请注意,此时您的结果将是20,000 * 20,000行,这相当多,您可能会在某些时候耗尽内存。虽然计算不应该超过一分钟。

请检查它在您的数据上的运作情况并给出一些反馈。我也希望您喜欢我的解决方案,并且笛卡尔坐标计算中产生的距离误差不会以任何特定方式困扰您。


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接