GeoPandas：他のデータフレームで最も近いポイントを見つける

20

2つのジオデータフレームがあります。

import geopandas as gpd
from shapely.geometry import Point
# Sample frame to annotate: one row per person with an ID and a point location.
gpd1 = gpd.GeoDataFrame([['John',1,Point(1,1)],['Smith',1,Point(2,2)],['Soap',1,Point(0,2)]],columns=['Name','ID','geometry'])
# Sample candidate frame: one row per named place with a point location.
gpd2 = gpd.GeoDataFrame([['Work',Point(0,1.1)],['Shops',Point(2.5,2)],['Home',Point(1,1.1)]],columns=['Place','geometry'])

そして、gpd1の各行について、gpd2の最も近いポイントの名前を見つけたいです。

desired_output = 

    Name  ID     geometry  Nearest
0   John   1  POINT (1 1)     Home
1  Smith   1  POINT (2 2)    Shops
2   Soap   1  POINT (0 2)     Work

私はラムダ関数を使用してこれを機能させようとしています：

# For each row of gpd1, ask min_dist for the closest gpd2 row and keep only
# its 'Place' value as the new 'Nearest' column.
gpd1['Nearest'] = gpd1.apply(
    lambda person: min_dist(person.geometry, gpd2)['Place'],
    axis=1,
)

と

def min_dist(point, gpd2):
    """Placeholder for the nearest-row lookup.

    Neither ``point`` nor ``gpd2`` is used yet; the result is whatever
    ``some_function()`` returns (that helper is not defined in the visible
    source).
    """
    return some_function()

— RedM
ソース

この方法でうまくいきました：stackoverflow.com/questions/37402046 / ... リンク先を参照してください

— ジョニー・チーシェッター

16

Shapelyのnearest_points関数を直接使用できます（GeoSeriesのジオメトリはShapelyジオメトリです）：

from shapely.ops import nearest_points
# unary union of the gpd2 geometries
pts3 = gpd2.geometry.unary_union
def near(point, pts=pts3):
    """Return the ``Place`` value of the gpd2 row closest to ``point``.

    Args:
        point: Shapely geometry to search from.
        pts: Geometry to search in; defaults to the union of all gpd2
            geometries computed above.

    Returns:
        The ``Place`` value of the first gpd2 row whose geometry equals the
        nearest point found in ``pts``.

    Raises:
        IndexError: If no gpd2 row matches the nearest point.
    """
    # nearest_points returns (closest point on `point`, closest point on `pts`).
    nearest_geom = nearest_points(point, pts)[1]
    # Use the spatial equality predicate; comparing the column with `==`
    # has been reported to come back all-False even for points that are
    # present in the column.
    is_nearest = gpd2.geometry.geom_equals(nearest_geom)
    # Series.get_values() no longer exists in pandas >= 1.0; take the first
    # matching value positionally instead.
    return gpd2.loc[is_nearest, 'Place'].iloc[0]
gpd1['Nearest'] = gpd1.apply(lambda row: near(row.geometry), axis=1)
gpd1
    Name  ID     geometry  Nearest
0   John   1  POINT (1 1)     Home
1  Smith   1  POINT (2 2)    Shops
2   Soap   1  POINT (0 2)     Work

説明

# Show, for every gpd1 row, the pair of mutually nearest points between the
# row's geometry and the gpd2 union (pts3).
for _, row in gpd1.iterrows():
    # nearest_points returns the points in the same order as its arguments;
    # compute the pair once instead of once per printed element.
    origin, target = nearest_points(row.geometry, pts3)
    print(origin, target)
 POINT (1 1) POINT (1 1.1)
 POINT (2 2) POINT (2.5 2)
 POINT (0 2) POINT (0 1.1)

— ジーン
ソース

どうしてもうまく動作せず、原因がわかりません。この関数は、ジオメトリが正常であっても空のGeoSeriesを返します。例： sample_point = gpd2.geometry.unary_union[400] / sample_point in gpd2.geometry これはTrueを返します。 gpd2.geometry == sample_point これはすべてFalseになります。

— ロブロック

上記への補足：gpd2.geometry.geom_equals(sample_point) は動作します。

— ロブロック

13

大きなデータフレームがある場合、scipyのcKDTree空間インデックスの.queryメソッドは最近傍検索で非常に高速に結果を返すことがわかりました。空間インデックスを使用するため、データフレームをループしてすべての距離の最小値を見つけるよりも桁違いに高速です。また、cKDTreeでは検索をベクトル化できますが他の方法ではできないため、RTree（geopandasで利用可能な空間インデックスメソッド）とshapelyのnearest_pointsを使用するよりも高速です。

gpd1の各ポイントについて、gpd2内の最近傍までの距離とその「名前」を返すヘルパー関数を次に示します。両方のgdfにgeometry（ポイントの）列があると想定しています。

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point

# Query points: one row per person (Name, ID) with a point geometry.
gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)], ['Smith', 1, Point(2, 2)],
                         ['Soap', 1, Point(0, 2)]],
                        columns=['Name', 'ID', 'geometry'])
# Candidate points: one row per named place with a point geometry.
gpd2 = gpd.GeoDataFrame([['Work', Point(0, 1.1)], ['Shops', Point(2.5, 2)],
                         ['Home', Point(1, 1.1)]],
                        columns=['Place', 'geometry'])

def ckdnearest(gdA, gdB):
    """Attach to every point of ``gdA`` its nearest point in ``gdB``.

    A cKDTree is built over the x/y coordinates of ``gdB.geometry`` and
    queried once with all x/y coordinates of ``gdA.geometry``.

    Args:
        gdA: Frame of query points; ``gdA.geometry`` must expose ``.x`` and
            ``.y`` (point geometries).
        gdB: Frame of candidate points, with the same requirement.

    Returns:
        ``gdA`` (rows and index unchanged) concatenated column-wise with
        the matched ``gdB`` row's original index label (the column added by
        ``reset_index()``), its non-``geometry`` columns, and a ``dist``
        column holding the Euclidean distance to the match.
    """
    nA = np.column_stack((gdA.geometry.x, gdA.geometry.y))
    nB = np.column_stack((gdB.geometry.x, gdB.geometry.y))
    dist, idx = cKDTree(nB).query(nA, k=1)
    # The tree returns row *positions* in nB, so select with iloc; label
    # based .loc only agrees with positions for a default RangeIndex.
    matched = gdB.iloc[idx].loc[:, gdB.columns != 'geometry'].reset_index()
    # concat(axis=1) aligns on index labels: give every piece gdA's index so
    # rows pair up one-to-one even when gdA was filtered or re-indexed.
    matched.index = gdA.index
    distances = pd.Series(dist, index=gdA.index, name='dist')
    return pd.concat([gdA, matched, distances], axis=1)

# Example call: match every gpd1 row to its nearest gpd2 point.
ckdnearest(gpd1, gpd2)

そして、LineStringに最も近いポイントを見つけたい場合、完全な例があります：

import itertools
from operator import itemgetter

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString

# Query points: one row per person (Name, ID) with a point geometry.
gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
                         ['Smith', 1, Point(2, 2)],
                         ['Soap', 1, Point(0, 2)]],
                        columns=['Name', 'ID', 'geometry'])
# Candidate lines: one row per named place with a LineString geometry.
gpd2 = gpd.GeoDataFrame([['Work', LineString([Point(100, 0), Point(100, 1)])],
                         ['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
                         ['Home',  LineString([Point(101, 0), Point(102, 1)])]],
                        columns=['Place', 'geometry'])


def ckdnearest(gdfA, gdfB, gdfB_cols=None):
    """Attach to every point of ``gdfA`` the ``gdfB`` row with the nearest vertex.

    All vertices of all ``gdfB`` geometries are put into one cKDTree, so the
    reported distance is the Euclidean distance to the nearest *vertex* of
    the matched geometry, not to the nearest location along its segments.

    Args:
        gdfA: Frame of query geometries; each geometry must contribute
            exactly one coordinate (i.e. points).
        gdfB: Frame of candidate geometries exposing ``.coords``.
        gdfB_cols: Columns of ``gdfB`` to copy onto the result; defaults to
            ``['Place']``.

    Returns:
        ``gdfA`` (rows and index unchanged) concatenated column-wise with
        the requested columns of the matched ``gdfB`` row and a ``dist``
        column.

    Raises:
        ValueError: If the geometries of ``gdfA`` do not yield exactly one
            coordinate per row.
    """
    if gdfB_cols is None:
        gdfB_cols = ['Place']
    query_pts = np.concatenate(
        [np.array(geom.coords) for geom in gdfA.geometry.to_list()])
    if len(query_pts) != len(gdfA):
        raise ValueError(
            'gdfA geometries must each have exactly one coordinate')
    vertex_sets = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
    # owner[k] is the row position in gdfB of the geometry that vertex k
    # belongs to.
    owner = np.repeat(np.arange(len(vertex_sets)),
                      [len(v) for v in vertex_sets])
    dist, vertex_idx = cKDTree(np.concatenate(vertex_sets)).query(query_pts, k=1)
    # Array indexing keeps a 1-D result even for a single query row, and
    # iloc is used because these are row positions, not index labels.
    nearest = gdfB.loc[:, gdfB_cols].iloc[owner[vertex_idx]]
    # concat(axis=1) aligns on index labels, so reuse gdfA's index.
    nearest.index = gdfA.index
    distances = pd.Series(dist, index=gdfA.index, name='dist')
    return pd.concat([gdfA, nearest, distances], axis=1)

# Example call: match every gpd1 point to a gpd2 line.
c = ckdnearest(gpd1, gpd2)

— JHuw
ソース

この方法を使用して、線上の最も近い点を与えることは可能ですか？たとえば、GPS位置を最も近い通りにスナップするには。

— ハイパーノット

この答えは素晴らしいです！ただし、ラインに最も近いポイントを求めるコードにはバグがあります。各ポイントに対して最も近いラインまでの正しい距離は返されるようですが、返されるラインIDが間違っています。idxの計算が原因だと思いますが、Pythonにはまだあまり慣れていないので、完全には理解できていません。

— Shakedk

1

解決しました：

def min_dist(point, gpd2):
    """Return the row of ``gpd2`` whose geometry is closest to ``point``.

    Args:
        point: Geometry providing a ``distance(other)`` method.
        gpd2: Frame with a ``geometry`` column of candidate geometries.

    Returns:
        The closest row as a Series, including a ``Dist`` entry holding its
        distance to ``point``. ``gpd2`` itself is left unmodified.
    """
    dists = gpd2.geometry.apply(point.distance)
    # Take the positional argmin of the raw array so that iloc is correct
    # regardless of gpd2's index labels.
    nearest_pos = dists.values.argmin()
    # assign() works on a copy, so the caller's frame does not gain a
    # 'Dist' column as a side effect of every lookup.
    return gpd2.assign(Dist=dists).iloc[nearest_pos]

もちろん、批判も歓迎します。私は、gpd1のすべての行に対してgpd2 ['Dist']を再計算するのが好きではありません...

— RedM
ソース

1

ジーンによる答えは私にはうまくいきませんでした。最終的に、gpd2.geometry.unary_unionが、合計約150.000ポイントのうち約30.000だけを含むジオメトリを生成することを発見しました。同じ問題に遭遇した方のために、私の解決方法を以下に示します：

    from shapely.ops import nearest_points
    from shapely.geometry import MultiPoint

    # Build the search geometry directly from every gpd2 point.
    gpd2_pts_list = gpd2.geometry.tolist()
    gpd2_pts = MultiPoint(gpd2_pts_list)
    def nearest(point, gpd2_pts, gpd2=gpd2, geom_col='geometry', src_col='Place'):
        """Return the ``src_col`` value of the gpd2 row nearest to ``point``.

        Args:
            point: Shapely geometry to search from.
            gpd2_pts: Geometry to search in (here a MultiPoint of all gpd2
                points).
            gpd2: Frame the result value is looked up in.
            geom_col: Name of the geometry column of ``gpd2``.
            src_col: Name of the column whose value is returned.

        Raises:
            IndexError: If no row of ``gpd2`` matches the nearest point.
        """
        # find the nearest point
        nearest_point = nearest_points(point, gpd2_pts)[1]
        # Use the spatial equality predicate rather than `==` to locate the
        # row that owns the nearest point.
        is_nearest = gpd2[geom_col].geom_equals(nearest_point)
        # return the corresponding value of the src_col of the nearest point
        # (Series.get_values() no longer exists in pandas >= 1.0).
        return gpd2.loc[is_nearest, src_col].iloc[0]

    gpd1['Nearest'] = gpd1.apply(lambda x: nearest(x.geometry, gpd2_pts), axis=1)

— インスケ
ソース

0

@ JHuwからの優れた回答を使用しているときに、自分のデータでインデックスエラーが発生した場合、私の問題はインデックスが整合しないことでした。gdfAとgdfBのインデックスをリセットすることで私の問題が解決しました。これは@ Shakedkにも役立つかもしれません。

import itertools
from operator import itemgetter

import geopandas as gpd
import numpy as np
import pandas as pd

from scipy.spatial import cKDTree
from shapely.geometry import Point, LineString

# Query points: one row per person (Name, ID) with a point geometry.
gpd1 = gpd.GeoDataFrame([['John', 1, Point(1, 1)],
                         ['Smith', 1, Point(2, 2)],
                         ['Soap', 1, Point(0, 2)]],
                        columns=['Name', 'ID', 'geometry'])
# Candidate lines: one row per named place with a LineString geometry.
gpd2 = gpd.GeoDataFrame([['Work', LineString([Point(100, 0), Point(100, 1)])],
                         ['Shops', LineString([Point(101, 0), Point(101, 1), Point(102, 3)])],
                         ['Home',  LineString([Point(101, 0), Point(102, 1)])]],
                        columns=['Place', 'geometry'])


def ckdnearest(gdfA, gdfB, gdfB_cols=None):
    """Attach to every point of ``gdfA`` the ``gdfB`` row with the nearest vertex.

    Both frames are re-indexed to 0..n-1 first, so the result also carries a
    fresh 0..n-1 index. All vertices of all ``gdfB`` geometries are put into
    one cKDTree; the reported distance is the Euclidean distance to the
    nearest *vertex* of the matched geometry, not to the nearest location
    along its segments.

    Args:
        gdfA: Frame of query geometries; each geometry must contribute
            exactly one coordinate (i.e. points).
        gdfB: Frame of candidate geometries exposing ``.coords``.
        gdfB_cols: Columns of ``gdfB`` to copy onto the result; defaults to
            ``['Place']``.

    Returns:
        The re-indexed ``gdfA`` concatenated column-wise with the requested
        columns of the matched ``gdfB`` row and a ``dist`` column.

    Raises:
        ValueError: If the geometries of ``gdfA`` do not yield exactly one
            coordinate per row.
    """
    if gdfB_cols is None:
        gdfB_cols = ['Place']
    # resetting the index of gdfA and gdfB here, so that tree positions and
    # index labels coincide and the final concat aligns row by row.
    gdfA = gdfA.reset_index(drop=True)
    gdfB = gdfB.reset_index(drop=True)
    query_pts = np.concatenate(
        [np.array(geom.coords) for geom in gdfA.geometry.to_list()])
    if len(query_pts) != len(gdfA):
        raise ValueError(
            'gdfA geometries must each have exactly one coordinate')
    vertex_sets = [np.array(geom.coords) for geom in gdfB.geometry.to_list()]
    # owner[k] is the row of gdfB that vertex k belongs to.
    owner = np.repeat(np.arange(len(vertex_sets)),
                      [len(v) for v in vertex_sets])
    dist, vertex_idx = cKDTree(np.concatenate(vertex_sets)).query(query_pts, k=1)
    # Array indexing keeps a 1-D result even when gdfA has a single row, so
    # the selection below is always a frame with the requested columns.
    nearest = gdfB.loc[owner[vertex_idx], gdfB_cols].reset_index(drop=True)
    return pd.concat(
        [gdfA, nearest, pd.Series(dist, name='dist')], axis=1)

# Example call: match every gpd1 point to a gpd2 line.
c = ckdnearest(gpd1, gpd2)

— マルクス・ローゼンフェルダー
ソース