我已经从人口普查局下载了美国所有城镇和城市等的列表。以下是一个随机样本:
dput(somewhere)
structure(list(state = structure(c(30L, 31L, 5L, 31L, 24L, 36L,
13L, 21L, 6L, 10L, 31L, 28L, 10L, 5L, 5L, 8L, 23L, 11L, 34L,
19L, 29L, 4L, 24L, 13L, 21L, 31L, 2L, 3L, 29L, 24L, 1L, 13L,
15L, 10L, 11L, 33L, 35L, 8L, 11L, 12L, 36L, 28L, 9L, 31L, 8L,
14L, 11L, 12L, 36L, 13L, 8L, 5L, 29L, 8L, 7L, 23L, 25L, 39L,
16L, 28L, 10L, 29L, 26L, 8L, 32L, 40L, 28L, 23L, 37L, 31L, 18L,
5L, 1L, 31L, 18L, 13L, 11L, 10L, 25L, 18L, 21L, 18L, 11L, 35L,
31L, 36L, 20L, 19L, 38L, 2L, 40L, 13L, 36L, 11L, 29L, 27L, 22L,
17L, 12L, 20L), .Label = c("ak", "al", "ar", "az", "ca", "co",
"fl", "ga", "hi", "ia", "il", "in", "ks", "ky", "la", "md", "mi",
"mo", "ms", "mt", "nc", "nd", "ne", "nj", "nm", "nv", "ny", "oh",
"ok", "or", "pa", "pr", "ri", "sc", "sd", "tx", "ut", "va", "wa",
"wi"), class = "factor"), geoid = c(4120100L, 4280962L, 668028L,
4243944L, 3460600L, 4871948L, 2046000L, 3747695L, 839965L, 1909910L,
4244824L, 3902204L, 1963750L, 622360L, 669088L, 1382972L, 3125230L,
1722736L, 4539265L, 2804705L, 4039625L, 451465L, 3467020L, 2077150L,
3765260L, 4221792L, 133904L, 566320L, 4033150L, 3463180L, 223460L,
2013675L, 2232405L, 1951600L, 1752142L, 4445010L, 4655684L, 1336416L,
1729080L, 1840842L, 4804672L, 3932207L, 1523675L, 4260384L, 1321912L,
2159232L, 1735307L, 1867176L, 4839304L, 2057350L, 1309656L, 655380L,
4082250L, 1350680L, 1275475L, 3147745L, 3505010L, 5352285L, 2483337L,
3909834L, 1912945L, 4068200L, 3227900L, 1366304L, 7286831L, 5505350L,
3982390L, 3149915L, 4974480L, 4249440L, 2943346L, 677430L, 280770L,
4247872L, 2902242L, 2039075L, 1735281L, 1932565L, 3580120L, 2973852L,
3722620L, 2943238L, 1755938L, 4643100L, 4251904L, 4830920L, 3056575L,
2801940L, 5156048L, 137000L, 5508925L, 2057300L, 4861172L, 1736477L,
4021200L, 3677783L, 3832060L, 2614900L, 1820332L, 3043750L),
ansicode = c(2410344L, 2390453L, 2411793L, 1214759L, 885360L,
2412035L, 485621L, 2403359L, 2412812L, 2583481L, 2390095L,
2397971L, 2396237L, 2585422L, 2411819L, 2405746L, 2398338L,
2394628L, 2812929L, 2586582L, 2408478L, 2582836L, 885393L,
2397270L, 2402898L, 2584453L, 2405811L, 2405518L, 2412737L,
2389752L, 2418574L, 2393549L, 2402559L, 2629970L, 2399453L,
2378109L, 2812999L, 2402563L, 2398956L, 2396699L, 2409759L,
2393028L, 2414061L, 2805542L, 2404192L, 2404475L, 2398514L,
2629884L, 2408486L, 2396265L, 2405306L, 2411363L, 2413515L,
2405064L, 2402989L, 2583899L, 2629103L, 2585016L, 2390487L,
2397481L, 2393811L, 2413298L, 2583927L, 2812702L, 2415078L,
1582764L, 2400116L, 2400036L, 2412013L, 2633665L, 2787908L,
2583158L, 2418866L, 1214943L, 2393998L, 485611L, 2398513L,
2394969L, 2806756L, 2397053L, 2406485L, 2395719L, 2399572L,
1267480L, 2389516L, 2410660L, 2409026L, 2806379L, 2584894L,
2404746L, 2586459L, 2396263L, 2411528L, 2398556L, 2412443L,
2584298L, 1036064L, 2806333L, 2396920L, 2804282L), city = c("donald",
"warminster heights", "san juan capistrano", "littlestown",
"port republic", "taylor", "merriam", "northlakes", "julesburg",
"california junction", "lower allen", "antwerp", "pleasantville",
"el rancho", "santa clarita", "willacoochee", "kennard",
"effingham", "la france", "beechwood", "keys", "orange grove mobile manor",
"shiloh", "west mineral", "stony point", "east salem", "heath",
"stamps", "haworth", "rio grande", "ester", "clayton", "hackberry",
"middle amana", "new baden", "melville", "rolland colony",
"hannahs mill", "germantown hills", "la fontaine", "aurora",
"green meadows", "kaiminani", "pinecroft", "dawson", "park city",
"hinsdale", "st. meinrad", "kingsland", "powhattan", "bowersville",
"palos verdes estates", "wyandotte", "meigs", "waverly",
"sunol", "arroyo hondo", "outlook", "west pocomoke", "buchtel",
"chatsworth", "smith village", "glenbrook", "rock spring",
"villalba", "bayfield", "waynesfield", "utica", "sunset",
"milford square", "lithium", "swall meadows", "unalaska",
"martinsburg", "ashland", "leawood", "hindsboro", "gray",
"turley", "trimble", "falcon", "linn", "olympia fields",
"mitchell", "mount pleasant mills", "greenville", "park city",
"arkabutla", "new river", "huntsville", "boulder junction",
"potwin", "red lick", "huey", "dougherty", "wadsworth", "grand forks",
"chassell", "edgewood", "lindsay"), lsad = c("25", "57",
"25", "21", "25", "25", "25", "57", "43", "57", "57", "47",
"25", "57", "25", "25", "47", "25", "57", "57", "57", "57",
"21", "25", "57", "57", "43", "25", "43", "57", "57", "25",
"57", "57", "47", "57", "57", "57", "47", "43", "25", "57",
"57", "57", "25", "25", "47", "57", "57", "25", "43", "25",
"43", "25", "57", "57", "57", "57", "57", "47", "25", "43",
"57", "57", "62", "25", "47", "47", "25", "57", "57", "57",
"25", "21", "25", "25", "47", "25", "57", "25", "43", "25",
"47", "25", "57", "25", "57", "57", "57", "25", "57", "25",
"25", "47", "43", "57", "25", "57", "43", "57"), funcstat = c("a",
"s", "a", "a", "a", "a", "a", "s", "a", "s", "s", "a", "a",
"s", "a", "a", "a", "a", "s", "s", "s", "s", "a", "a", "s",
"s", "a", "a", "a", "s", "s", "a", "s", "s", "a", "s", "s",
"s", "a", "a", "a", "s", "s", "s", "a", "a", "a", "s", "s",
"a", "a", "a", "a", "a", "s", "s", "s", "s", "s", "a", "a",
"a", "s", "s", "s", "a", "a", "a", "a", "s", "s", "s", "a",
"a", "a", "a", "a", "a", "s", "a", "a", "a", "a", "a", "s",
"a", "s", "s", "s", "a", "s", "a", "a", "a", "a", "s", "a",
"s", "a", "s"), latitude = c(45.221487, 40.18837, 33.500889,
39.74517, 39.534798, 30.573263, 39.017607, 35.780523, 40.984864,
41.56017, 40.226748, 41.180176, 41.387011, 36.220684, 34.414083,
31.335094, 41.474697, 39.120662, 34.616281, 32.336723, 35.802786,
32.598451, 39.462418, 37.283906, 35.867809, 40.608713, 31.344839,
33.354959, 33.840898, 39.019051, 64.879056, 39.736866, 29.964958,
41.794765, 38.536765, 41.559549, 44.3437, 32.937302, 40.768954,
40.673893, 33.055942, 39.867193, 19.757709, 40.564189, 31.771864,
37.093499, 41.800683, 38.168142, 30.666141, 39.761734, 34.373065,
33.774271, 36.807143, 31.071788, 27.985282, 41.154105, 36.534599,
46.331153, 38.096527, 39.463511, 42.916301, 35.45079, 39.100123,
34.81467, 18.127809, 46.81399, 40.602442, 40.895279, 41.13806,
40.433182, 37.831844, 37.50606, 53.910255, 40.310917, 38.812464,
38.907263, 39.684775, 41.841711, 36.736661, 39.476152, 35.194804,
38.478798, 41.521996, 43.730057, 40.724697, 33.111939, 45.630946,
34.700227, 37.142945, 34.782275, 46.1148, 37.938624, 33.485081,
38.605285, 34.399808, 42.821447, 47.921291, 47.036116, 40.103208,
47.224885), longitude = c(-122.837813, -75.084089, -117.654388,
-77.089213, -74.476099, -97.427116, -94.693955, -81.367835,
-102.262708, -95.994752, -76.902769, -84.736099, -93.272787,
-119.068357, -118.494729, -83.044003, -96.203696, -88.550859,
-82.770697, -90.808692, -94.941358, -114.660588, -75.29244,
-94.926801, -81.044121, -77.23694, -86.46905, -93.497879,
-94.657035, -74.87787, -148.041153, -100.176484, -93.410178,
-91.901539, -89.707193, -71.301933, -96.59226, -84.340945,
-89.462982, -85.722023, -97.509615, -83.945334, -156.001765,
-78.353464, -84.443499, -86.048077, -87.928172, -86.832128,
-98.454026, -95.634011, -83.084305, -118.425754, -94.729305,
-84.092683, -81.625304, -102.762746, -105.666602, -120.092812,
-75.579197, -82.180426, -96.514499, -97.457006, -119.927289,
-85.238869, -66.481897, -90.822546, -83.973881, -97.345349,
-112.028388, -75.405024, -89.88325, -118.642656, -166.529029,
-78.324286, -92.239531, -94.62524, -88.134729, -94.985863,
-107.792147, -94.561898, -78.65389, -91.844989, -87.691648,
-98.029974, -77.026451, -96.110256, -108.925311, -90.121565,
-80.595817, -86.532599, -89.654438, -97.01835, -94.161474,
-89.289973, -97.05148, -77.893875, -97.08933, -88.530745,
-85.737461, -105.152791), designation = c("city", "cdp",
"city", "borough", "city", "city", "city", "cdp", "town",
"cdp", "cdp", "village", "city", "cdp", "city", "city", "village",
"city", "cdp", "cdp", "cdp", "cdp", "borough", "city", "cdp",
"cdp", "town", "city", "town", "cdp", "cdp", "city", "cdp",
"cdp", "village", "cdp", "cdp", "cdp", "village", "town",
"city", "cdp", "cdp", "cdp", "city", "city", "village", "cdp",
"cdp", "city", "town", "city", "town", "city", "cdp", "cdp",
"cdp", "cdp", "cdp", "village", "city", "town", "cdp", "cdp",
"urbana", "city", "village", "village", "city", "cdp", "cdp",
"cdp", "city", "borough", "city", "city", "village", "city",
"cdp", "city", "town", "city", "village", "city", "cdp",
"city", "cdp", "cdp", "cdp", "city", "cdp", "city", "city",
"village", "town", "cdp", "city", "cdp", "town", "cdp")), row.names = c(22769L,
24845L, 3314L, 24015L, 17360L, 28139L, 10085L, 19881L, 3886L,
8750L, 24027L, 20585L, 9362L, 2499L, 3333L, 6041L, 16321L, 6847L,
25249L, 14051L, 22233L, 1210L, 17425L, 10353L, 20053L, 23545L,
253L, 1951L, 22166L, 17386L, 685L, 9771L, 11134L, 9225L, 7386L,
25001L, 25862L, 5663L, 6950L, 8239L, 26555L, 20991L, 6108L, 24388L,
5551L, 10772L, 7056L, 8470L, 27292L, 10202L, 5451L, 3116L, 22660L,
5776L, 5317L, 16546L, 17582L, 29958L, 12103L, 20709L, 8779L,
22515L, 16665L, 5902L, 31901L, 30658L, 21745L, 16574L, 28632L,
24127L, 15046L, 3455L, 930L, 24087L, 14494L, 10016L, 7055L, 8993L,
18048L, 15434L, 19615L, 15043L, 7454L, 25775L, 24194L, 27115L,
15857L, 14038L, 29305L, 276L, 30693L, 10201L, 27863L, 7075L,
22046L, 19267L, 20311L, 12502L, 8093L, 15798L), class = "data.frame")
我想使用longitude
和latitude
列以及gdist
函数计算city
列中每个条目之间的距离。我知道以下的for
循环可以工作并且易于阅读:
dist_list <- list()
for (i in 1:nrow(somewhere)) {
dist_list[[i]] <- gdist(lon.1 = somewhere$longitude[i],
lat.1 = somewhere$latitude[i],
lon.2 = somewhere$longitude,
lat.2 = somewhere$latitude,
units="miles")
}
然而:在完整数据集(31K+行)上运行时间非常长,需要几个小时。我正在寻找能够加速计算的方法。我认为采用“分割-应用-合并”方法中的一些方式会很好,因为最终我想涉及一对分组变量,即“geo_block”和“ansi_block”,但实际上任何比目前方法更好的方法都可以。
我尝试了以下方法:
somewhere$geo_block <- substr(somewhere$geoid, 1, 1)
somewhere$ansi_block <- substr(somewhere$ansicode, 1, 1)
somewhere <- somewhere %>%
split(.$geo_block, .$ansi_block) %>%
mutate(dist = gdist(longlat = somewhere[, c("longitude", "latitude")]))
但我不确定如何在标准for
循环之外指定第二组经纬度输入。
我的问题:
- 如何使用
split-apply-combine
方法解决这个问题,其中将geo_block
和ansi_block
作为分组变量,如上所述? 我想返回最短距离以及对应于此距离的city
的名称和geo_block
的值。
欢迎所有建议。理想情况下,期望的结果会相当快,因为我实际处理的数据集非常大。由于我有点迷茫,所以我在问题中添加了一些奖励,以产生更多的兴趣,并希望能得到广泛的潜在答案,让我能够学习。非常感谢!
state
作为分组变量?有些州会比其他州拥有更多的位置。相反,你应该使用均等大小的分组,比如每500或1000行。 - Gregor Thomasgdist
зҡ„ж•ҲзҺҮеҰӮдҪ•пјҢдҪҶйӮЈйҮҢзҡ„дёҖдёӘзӯ”жЎҲжҳҫзӨәspatialrisk::haversine
жҜ”geosphere::distm
еҝ«дёҖдёӘж•°йҮҸзә§гҖӮ - Gregor Thomas