Grid-based Exploration of geosocial patterns (Germany) ¶
Alexander Dunkel, Leibniz Institute of Ecological Urban and Regional Development,
Transformative Capacities & Research Data Centre (IÖR-FDZ)
In this notebook, we synthesize the data further and look at patterns in aggregated grid cells.
Prepare environment¶
To run this notebook, you have two options:
1. Create an environment with the packages and versions shown in the following cell.
As a starting point, you may use the latest conda environment_default.yml from our Carto-Lab Docker container.
2. If Docker is available to you, we suggest using the Carto-Lab Docker container:
Clone the repository and edit the JUPYTER_NOTEBOOKS
value in your .env to point to the repository where this notebook can be found, e.g.:
git clone https://gitlab.vgiscience.de/lbsn/tools/jupyterlab.git
cd jupyterlab
cp .env.example .env
nano .env
## Enter:
# JUPYTER_NOTEBOOKS=~/notebooks/geosocial_patterns_de
# TAG=v0.12.3
docker network create lbsn-network
docker-compose pull && docker-compose up -d
Load dependencies:
import os
import csv
import sys
import colorcet
import psycopg2 # Postgres API
import geoviews as gv
import holoviews as hv
import mapclassify as mc
import geopandas as gp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geoviews.feature as gf
from pathlib import Path
from collections import namedtuple
from typing import List, Tuple, Dict, Optional
from pyproj import Transformer, CRS, Proj
from geoviews import opts
from shapely.geometry import shape, Point, Polygon
from shapely.ops import transform
from cartopy import crs
from matplotlib import colors
from IPython.display import clear_output, display, HTML
from bokeh.models import HoverTool, FuncTickFormatter, FixedTicker
# optionally, enable shapely.speedups,
# which makes some of the spatial
# queries run faster
import shapely.speedups as speedups
from shapely import geometry
Activate autoreload of changed python files:
%load_ext autoreload
%autoreload 2
Load helper modules:
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
sys.path.append(module_path)
from modules.base import tools, preparations
from modules.base import grid as gr
Initialize Bokeh and shapely.speedups
preparations.init_imports()
Parameters¶
Define initial parameters that affect processing
WORK_DIR = Path.cwd().parents[0] / "tmp" # Working directory
EPSG_CODE = 25832 # Target projection EPSG Code
CRS_PROJ = f"epsg:{EPSG_CODE}" # Target projection
CRS_WGS = "epsg:4326" # Input projection (WGS84)
OUTPUT = Path.cwd().parents[0] / "out" # Define path to output directory (figures etc.)
GRID_SIZE_METERS = 10000 # Define Grid size in Meters
WORK_DIR.mkdir(exist_ok=True)
tools.create_paths(OUTPUT)
Folder structure and input data¶
root = Path.cwd().parents[0] / "data"
tools.tree(Path.cwd().parents[0], ignore_files_folders=["out", "tmp"])
ORIGIN_DE_HLL = root / "2024-09-09_origin_DE_HLL.csv"
Create grid¶
Define the coordinate transformers between WGS1984 and the target projection ahead of time
# define Transformer ahead of time
# with xy-order of coordinates
PROJ_TRANSFORMER = Transformer.from_crs(
CRS_WGS, CRS_PROJ, always_xy=True)
# also define reverse projection
PROJ_TRANSFORMER_BACK = Transformer.from_crs(
CRS_PROJ, CRS_WGS, always_xy=True)
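As a quick check, a coordinate pair can be round-tripped through both transformers (the Dresden coordinates below are purely illustrative):
# illustrative only: project a WGS84 lon/lat pair to UTM and back
x, y = PROJ_TRANSFORMER.transform(13.74, 51.05)
lon, lat = PROJ_TRANSFORMER_BACK.transform(x, y)
print(f"({x:.0f}, {y:.0f}) -> ({lon:.2f}, {lat:.2f})")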
Get Shapefile for Germany¶
source_zip = "https://daten.gdz.bkg.bund.de/produkte/vg/vg2500/aktuell/"
filename = "vg2500_12-31.utm32s.shape.zip"
shapes_name = "vg2500_12-31.utm32s.shape/vg2500/VG2500_LAN.shp"
SHAPE_DIR = (OUTPUT / "shapes")
SHAPE_DIR.mkdir(exist_ok=True)
if not (SHAPE_DIR / shapes_name).exists():
tools.get_zip_extract(uri=source_zip, filename=filename, output_path=SHAPE_DIR)
else:
print("Already exists")
shapes = gp.read_file(SHAPE_DIR / shapes_name)
shapes = shapes.to_crs(f"EPSG:{EPSG_CODE}")
Create overlay
ax = shapes.plot(color='none', edgecolor='black', linewidth=0.2, figsize=(2, 4))
ax.set_axis_off()
Get bounds
XMIN, YMIN, XMAX, YMAX = shapes.total_bounds
print(f'Projected bounds: {[XMIN, YMIN, XMAX, YMAX]}')
Make Grid¶
def create_grid_df(
grid_size: int = GRID_SIZE_METERS,
xmin: float = XMIN, ymin: float = YMIN,
xmax: float = XMAX, ymax: float = YMAX,
report: bool = False,
return_rows_cols: bool = False):
"""Creates dataframe polygon grid based on width and length in Meters"""
width = grid_size
length = grid_size
cols = list(range(int(np.floor(xmin)), int(np.ceil(xmax)), width))
rows = list(range(int(np.floor(ymin)), int(np.ceil(ymax)), length))
if report:
print(len(cols))
print(len(rows))
rows.reverse()
polygons = []
for x in cols:
for y in rows:
# combine to tuple: (x,y, poly)
# and append to list
polygons.append(
(x, y,
Polygon([
(x,y),
(x+width, y),
(x+width, y-length),
(x, y-length)])) )
# create a pandas dataframe
# from list of tuples
grid = pd.DataFrame(polygons)
# name columns
col_labels=['xbin', 'ybin', 'bin_poly']
grid.columns = col_labels
# use x and y as index columns
grid.set_index(['xbin', 'ybin'], inplace=True)
if return_rows_cols:
return grid, rows, cols
return grid
grid, rows, cols = create_grid_df(
grid_size=GRID_SIZE_METERS,
report=True, return_rows_cols=True)
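A minimal sanity check: the DataFrame should contain one polygon per column/row combination.
# one grid cell per (col, row) pair
print(f"{len(cols)} cols x {len(rows)} rows = {len(grid)} cells")
assert len(grid) == len(cols) * len(rows)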
Create a geodataframe from dataframe:
def grid_to_gdf(
grid: pd.DataFrame, crs_proj: str = CRS_PROJ) -> gp.GeoDataFrame:
"""Convert grid pandas DataFrame to geopandas Geodataframe"""
grid = gp.GeoDataFrame(
grid.drop(
columns=["bin_poly"]),
geometry=grid.bin_poly)
grid.crs = crs_proj
return grid
grid = grid_to_gdf(grid)
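Both layers should now share the target CRS; a quick assertion before plotting them together:
# grid was assigned CRS_PROJ, shapes were reprojected to the same EPSG code
assert grid.crs == shapes.crs, "CRS mismatch between grid and shapes"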
Preview Grid¶
Plot grid and combine with DE geometry
base = grid.plot(figsize=(6, 9), color='white', edgecolor='black', linewidth=0.1)
plot = shapes.plot(ax=base, alpha=0.7, color="#7aa0c4", edgecolor='black', linewidth=0.25)
base.set_axis_off()
YBINS = np.array(rows)
XBINS = np.array(cols)
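For reference, a single projected coordinate can be assigned to its grid cell with np.digitize; the helper module presumably follows similar logic (function name and example coordinates here are illustrative):
def get_best_bins(x: float, y: float) -> Tuple[int, int]:
    """Return the (xbin, ybin) cell index containing a projected point.

    XBINS ascend; YBINS descend (rows were reversed above)."""
    xbin = XBINS[np.digitize(x, XBINS) - 1]
    ybin = YBINS[np.digitize(y, YBINS) - 1]
    return int(xbin), int(ybin)

# example: project a WGS84 coordinate, then look up its cell
x, y = PROJ_TRANSFORMER.transform(13.74, 51.05)
print(get_best_bins(x, y))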
Process data¶
We want to aggregate geosocial post locations to grid bins. For this, we largely follow the processing described here.
Preview input data:
pd.set_option('display.max_colwidth', 150)
tools.record_preview_hll(ORIGIN_DE_HLL)
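Optionally, preview only the coordinate columns used in the aggregation below (the column names match the usecols parameter further down):
pd.read_csv(
    ORIGIN_DE_HLL,
    usecols=['latitude', 'longitude', 'origin_id'],
    nrows=5)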
Credentials are defined as environment variables and loaded here:
DB_USER = "postgres"
DB_PASS = os.getenv('POSTGRES_PASSWORD')
# set connection variables
DB_HOST = "hlldb"
DB_PORT = "5432"
DB_NAME = "hlldb"
Connect to empty Postgres database running HLL Extension:
DB_CONN = psycopg2.connect(
host=DB_HOST,
port=DB_PORT,
dbname=DB_NAME,
user=DB_USER,
password=DB_PASS
)
DB_CONN.set_session(
readonly=True)
DB_CALC = tools.DbConn(
DB_CONN)
CUR_HLL = DB_CONN.cursor()
Test connection:
CUR_HLL.execute("SELECT 1;")
print(CUR_HLL.statusmessage)
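As a further check that the HLL extension responds, build a sketch from two hashed values and read its estimated cardinality; this uses only functions of the postgresql-hll extension and should print 2:
# union two hashed values into one HLL sketch and estimate its cardinality
CUR_HLL.execute("""
    SELECT hll_cardinality(
        hll_add(
            hll_add(hll_empty(), hll_hash_text('user_a')),
            hll_hash_text('user_b')));
    """)
print(CUR_HLL.fetchone()[0])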
Plotting DE maps: Post Count¶
gr.reset_metrics(grid)
kwds = {
"data": ORIGIN_DE_HLL,
"grid": grid,
"store_benchmark_data": False,
"proj_transformer": PROJ_TRANSFORMER,
"proj_transformer_back": PROJ_TRANSFORMER_BACK,
"xbins": XBINS, "ybins": YBINS,
"db_calc": DB_CALC,
"store_benchmark_data": False,
"output": OUTPUT,
"bg": shapes, "figsize": (4, 9),
"usecols": ['latitude', 'longitude', 'origin_id'],
"scheme": "Quantiles",
"cmap_name": "bone",
"inverse": False,
"edgecolor": "black"
}
Define the mapping of origin names to IDs, used to plot maps for individual data sources:
origin_mapping = {
    "instagram": 1,
    "flickr": 2,
    "twitter": 3,
    "inaturalist": 23,
}
%%time
gr.load_plot(
title='Estimated Post Count (all) per 10 km grid, 2007-2022',
filename="postcount_est",
**kwds)
%%time
gr.load_plot(
title='Estimated Post Count (Instagram) per 10 km grid, 2007-2022',
filename="postcount_est", origin=1, **kwds)
%%time
gr.load_plot(
title='Estimated Post Count (Flickr) per 10 km grid, 2007-2022',
filename="postcount_est", origin=2, **kwds)
%%time
gr.load_plot(
title='Estimated Post Count (Twitter) per 10 km grid, 2007-2022',
filename="postcount_est", origin=3, **kwds)
%%time
gr.load_plot(
title='Estimated Post Count (iNaturalist) per 10 km grid, 2007-2022',
filename="postcount_est", origin=23, **kwds)
Create notebook HTML¶
!jupyter nbconvert --to html_toc \
--output-dir=../resources/html/ ./02_grid_exploration.ipynb \
--template=../nbconvert.tpl \
--ExtractOutputPreprocessor.enabled=False >&- 2>&-