PLZ Visualization of Geosocial Data for DE¶
Alexander Dunkel, Leibniz Institute of Ecological Urban and Regional Development,
Transformative Capacities & Research Data Centre (IÖR-FDZ)
Base data:
- 50 Million Geosocial Media posts in HLL format
- sorted by user-id, with the number of posts per user-id in ascending order
- ~9000 PLZ areas (shapes)
Preparations¶
Install additional dependencies not in worker_env
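A one-off install inside the notebook could look like the sketch below; the package list is an assumption about what may be missing from worker_env and should be adjusted as needed.
# Sketch: install packages assumed missing from worker_env (adjust the list as needed)
%pip install --quiet mapclassify hdbscan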
Load dependencies:
import mapclassify as mc
import geopandas as gp
import pandas as pd
import numpy as np
import hdbscan
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import textwrap
from pathlib import Path
from datetime import datetime
Activate autoreload of changed python files:
%load_ext autoreload
%autoreload 2
Load helper module.
import sys

module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
    sys.path.append(module_path)
from modules.base import tools, preparations
Initialize Bokeh and shapely.speedups
preparations.init_imports()
Parameters¶
Define initial parameters that affect processing
WORK_DIR = Path.cwd().parents[0] / "tmp" # Working directory
EPSG_CODE = 25832 # Target projection EPSG Code
CRS_PROJ = f"epsg:{EPSG_CODE}" # Target projection
CRS_WGS = "epsg:4326" # Input projection (Web Mercator)
OUTPUT = Path.cwd().parents[0] / "out" # Define path to output directory (figures etc.)
SHAPE_DIR = OUTPUT / "shapes" # Shapefiles
(OUTPUT / "figures").mkdir(exist_ok=True)
(OUTPUT / "svg").mkdir(exist_ok=True)
WORK_DIR.mkdir(exist_ok=True)
today = datetime.today().strftime('%Y-%m-%d')
Suppress warnings; turn this off during development:
warnings.filterwarnings("ignore")
warnings.filterwarnings('ignore', category=DeprecationWarning)
tools.create_paths(OUTPUT)
Set global scientific font
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.serif'] = ['Times New Roman'] + plt.rcParams['font.serif']
PLZ shapes¶
This file was loaded in the previous notebook.
plz_shapes = OUTPUT / "plz-5stellig.shp"
Load from file and plot preview
shapes = gp.read_file(plz_shapes, encoding='utf-8')
shapes.columns
ax = shapes.plot()
ax.set_axis_off()
len(shapes)
Preview PLZ Geosocial Counts¶
Load CSV from the previous notebook.
df_plz = pd.read_csv(OUTPUT / "2025-06-11_PLZ_DE_Geosocial.csv", dtype={"PLZ": str})
Join to the shapes based on PLZ
shapes.set_index("plz", inplace=True)
df_plz.set_index("PLZ", inplace=True)
Convert both indices to string and strip spaces
shapes.index = shapes.index.astype(str).str.strip()
df_plz.index = df_plz.index.astype(str).str.strip()
df_plz.head()
plz_counts_df = shapes.merge(
df_plz[["postcount", "usercount"]],
left_index=True, right_index=True, how="left")
Convert float to int for the counts
plz_counts_df['usercount'] = plz_counts_df['usercount'].fillna(0).astype(int)
plz_counts_df['postcount'] = plz_counts_df['postcount'].fillna(0).astype(int)
plz_counts_df.head()
plz_counts_df.loc["10179"]
Visualize¶
First project to UTM zone 32N (suitable for Germany).
plz_counts_df.to_crs(f"EPSG:{EPSG_CODE}", inplace=True)
Get the shapefile with Germany's federal state boundaries (BKG VG2500)
source_zip = "https://daten.gdz.bkg.bund.de/produkte/vg/vg2500/aktuell/"
filename = "vg2500_12-31.utm32s.shape.zip"
shapes_name = "vg2500_12-31.utm32s.shape/vg2500/VG2500_LAN.shp"
SHAPE_DIR.mkdir(exist_ok=True)
if not (SHAPE_DIR / shapes_name).exists():
tools.get_zip_extract(uri=source_zip, filename=filename, output_path=SHAPE_DIR)
else:
print("Already exists")
shapes = gp.read_file(SHAPE_DIR / shapes_name)
shapes = shapes.to_crs(f"EPSG:{EPSG_CODE}")
Create overlay
def plot_map(ax, title, caption=None):
    """Apply a consistent map layout: legend placement, title, hidden axes, optional caption."""
    leg = ax.get_legend()
    leg.set_bbox_to_anchor((0., 0., 0.0, 0.3))
    leg.get_frame().set_edgecolor('none')  # remove legend edge line
    ax.set_title(title)
    ax.set_axis_off()
    if caption is None:
        return
    wrapped_caption = textwrap.fill(caption, width=80)
    plt.figtext(0.5, 0.15, wrapped_caption, ha='center', va='top', fontsize=9)
title = "PLZ Geosocial Media Usercount\nQuantiles \n (Instagram, Flickr, Twitter, iNaturalist)"
ax = plz_counts_df.plot(
column='usercount', cmap='OrRd', scheme='Quantiles', figsize=(5, 8), legend=True)
ax = shapes.plot(ax=ax, color='none', edgecolor='white', linewidth=1.5)
plot_map(ax, title)
A different classification scheme for improved interpretability:
title = "PLZ Geosocial Media Usercount \nNatural Breaks\n (Instagram, Flickr, Twitter, iNaturalist)"
ax = plz_counts_df.plot(
column='usercount', cmap='OrRd', scheme='NaturalBreaks', k=5, figsize=(5, 8), legend=True)
ax = shapes.plot(ax=ax, color='none', edgecolor='white', linewidth=1.5)
plot_map(ax, title)
Active users per population ratio¶
- Replace zero or missing population with np.nan to avoid division errors
- Divide usercount by einwohner (population counts), per PLZ area
plz_counts_df['einwohner'] = plz_counts_df['einwohner'].replace(0, np.nan)
plz_counts_df["user_pop_ratio"] = plz_counts_df["usercount"] / plz_counts_df["einwohner"]
Let's look at the distribution of values
classifier = mc.Quantiles(plz_counts_df['user_pop_ratio'], k=5)
print("Class breaks:", classifier.bins)
The values are very skewed, which is expected for ratios derived from small denominators (populations).
We can create a more balanced map by preprocessing data with a Log Transformation and then classifying this log-transformed data.
- Apply log1p to handle zeroes safely
- Classify and plot the log-transformed data
plz_counts_df['log_user_pop_ratio'] = np.log1p(plz_counts_df['user_pop_ratio'])
cluster_input = plz_counts_df.dropna(subset=['log_user_pop_ratio'])
ax = cluster_input.plot(
column='log_user_pop_ratio', cmap='OrRd', scheme='NaturalBreaks', k=5, figsize=(5, 8), legend=True)
ax = shapes.plot(ax=ax, color='none', edgecolor='white', linewidth=1.5)
plot_map(ax, "Log-scaled Active Geosocial media Users\nper Population Ratio\n(PLZ based evaluation)")
The red areas have an unusually high level of social media activity compared to population density. Areas such as Saxon Switzerland, a popular recreational area near Dresden, stand out. Other areas include the Alpine foothills, the areas surrounding Berlin, the Baltic coast and Freiburg. All of these areas are particularly popular tourist destinations in Germany. This map therefore confirms our expectations, thereby increasing trust in the data.
Areas where the population is high but social media user activity is low¶
Let's look at the inverse mapping of user concentration relative to population density, to highlight underrepresented or "locally dense but socially quiet" areas.
plz_counts_df["pop_user_ratio"] = plz_counts_df["einwohner"] / (plz_counts_df["usercount"] + .1)
plz_counts_df['log_pop_user_ratio'] = np.log1p(plz_counts_df['pop_user_ratio'])
cluster_input = plz_counts_df.dropna(subset=['log_pop_user_ratio'])
ax = cluster_input.plot(
column='log_pop_user_ratio', cmap='YlGnBu', scheme='NaturalBreaks', k=5, figsize=(5, 8), legend=True)
ax = shapes.plot(ax=ax, color='none', edgecolor='white', linewidth=1.5)
plot_map(ax, "Local Dominance: High Population / Low Geosocial Media Activity")
Account for area bias¶
We also need to account for area bias. Large PLZs often cover rural or suburban regions and can artificially appear to have low user density or high local dominance simply because their population and user counts are spread over a larger area.
Calculate area.
plz_counts_df['area_m2'] = plz_counts_df.geometry.area
plz_counts_df['area_km2'] = plz_counts_df['area_m2'] / 1e6
Compute densities.
plz_counts_df['pop_density'] = \
plz_counts_df['einwohner'] / plz_counts_df['area_km2']
plz_counts_df['user_density'] = \
plz_counts_df['usercount'] / plz_counts_df['area_km2']
Update ratios to a density-based metric.
plz_counts_df['density_ratio'] = \
plz_counts_df['pop_density'] / (plz_counts_df['user_density']+.1)
plz_counts_df['log_density_ratio'] = \
np.log1p(plz_counts_df['density_ratio'])
cluster_input = plz_counts_df.dropna(subset=['log_density_ratio'])
ax = cluster_input.plot(
column='log_density_ratio', cmap='YlGnBu', scheme='NaturalBreaks', k=5, figsize=(5, 8), legend=True)
ax = shapes.plot(ax=ax, color='none', edgecolor='white', linewidth=1.5)
plot_map(ax, "Local Dominance: High Population / Low Geosocial Media Activity \n(area normalized)")
Export map to SVG and PNG
fig = ax.figure
tools.save_fig(fig, output=OUTPUT, name=f"{today}_log_density_ratio_map")
Extract PLZ selection ranking¶
List the top PLZ with at least 5000 residents, at least 5000 active geosocial media users, and an area of at most 50 km².
cluster_input.head()
top_plz = (
cluster_input[
(cluster_input['einwohner'] >= 5000) &
(cluster_input['usercount'] >= 5000) &
(cluster_input['area_km2'] <= 50) # adjust threshold
]
.sort_values(by='log_density_ratio', ascending=False)
)
print(f"Identified {len(top_plz)} PLZ with more than 5000 population and more than 5000 active geosocial media users")
print(top_plz.assign(plz=top_plz.index).head(10)[
['plz', 'einwohner', 'usercount', 'pop_user_ratio', 'log_pop_user_ratio', 'log_density_ratio']]
.to_string(index=False, float_format='{:,.2f}'.format))
What we did:
- Calculate ratios per PLZ between population and geosocial media activity.
- Normalize ratios by PLZ area, to avoid area bias for large PLZ.
- Find areas where the local population is large relative to geosocial media user activity.
- Apply minimum thresholds (5000 population and 5000 users) to ensure statistical robustness and avoid noisy small-sample PLZ.
- Apply a maximum area threshold of 50 km², to avoid overselecting a few large-area PLZ with sparse user densities.
- Sort by log_density_ratio in descending order, which prioritizes areas with the highest local dominance.
Visualize these 423 PLZ
ax = top_plz.plot(
column='log_density_ratio', cmap='YlGnBu', scheme='NaturalBreaks', k=5, figsize=(5, 8), legend=True)
ax = shapes.plot(ax=ax, color='none', edgecolor='white', linewidth=1.5)
plot_map(
ax=ax,
title="Local Dominance: High Population / Low Geosocial Media Activity\n" \
"(but more than 5000 population and\nmore than 5000 active geosocial media users)",
caption = (
"The map highlights postal code areas (PLZ) where the local population is significantly "
"overrepresented compared to the number of active geosocial media users. "
"These areas are likely less touristic or less digitally active in terms of geosocial platforms. "
"Only PLZs with more than 5,000 residents and more than 5,000 active users were considered to "
"ensure statistical relevance. In addition, very large and sparsely populated PLZ where excluded "
"by a maximum 50 km² area threshold."))
fig = ax.figure
tools.save_fig(fig, output=OUTPUT, name=f"{today}_log_density_ratio_PLZ_selection")
Identify clusters¶
We just want to see the distribution and identify major clusters. An approach is shown in João Paulo Figueira's Geographic Clustering with HDBSCAN, which I used in a previous notebook.
The critical parameter below is min_cluster_size: lower values result in more, but more tightly grouped clusters; larger values aggregate more, but may lead to clusters with far-apart islands.
# Compute centroids in the projected CRS, then reproject to WGS84 for the haversine metric
top_plz_centroids_wgs84 = top_plz.geometry.centroid.to_crs(epsg=4326)
lon = top_plz_centroids_wgs84.x
lat = top_plz_centroids_wgs84.y
data_for_hdbscan = np.array(list(zip(lat, lon)))
X_radians = np.radians(data_for_hdbscan)
min_pts_in_cluster = 2
clusterer = hdbscan.HDBSCAN(
min_cluster_size=min_pts_in_cluster,
metric='haversine',
allow_single_cluster=True
)
clusterer.fit(X_radians)
top_plz['cluster_label'] = clusterer.labels_
num_clusters = len(set(clusterer.labels_)) - (1 if -1 in clusterer.labels_ else 0)
num_noise = (clusterer.labels_ == -1).sum()
print(f"HDBSCAN found {num_clusters} clusters and {num_noise} noise points.")
unique_labels = sorted(list(set(clusterer.labels_)))
palette = sns.color_palette('viridis', num_clusters)
cluster_color_map = {
label: palette[i] for i, label in enumerate(
l for l in unique_labels if l != -1)}
cluster_color_map[-1] = (0.7, 0.7, 0.7, 1)
fig, ax = plt.subplots(1, 1, figsize=(8, 12))
top_plz.plot(
ax=ax,
column='cluster_label',
categorical=True,
legend=True,
legend_kwds={'title': "Cluster ID", 'loc': 'lower left', 'bbox_to_anchor': (0.0, 0.0)},
cmap=plt.cm.colors.ListedColormap([cluster_color_map[label] for label in unique_labels]),
missing_kwds={
"color": "lightgrey",
"edgecolor": "red",
"hatch": "///",
"label": "No Cluster (Noise)"
}
)
shapes.plot(ax=ax, color='none', edgecolor='black', linewidth=0.5, zorder=0)
plot_map(
ax=ax,
title=f"Geographic Clusters (HDBSCAN) of Selected PLZ Areas\n({num_clusters} clusters, min_size={min_pts_in_cluster})",
)
plt.tight_layout()
plt.show()
fig = ax.figure
tools.save_fig(fig, output=OUTPUT, name=f"{today}_cluster_map")
Convert to Convex Hulls/Alpha Shapes¶
Ideally, we want to extract contiguous areas of PLZ with no gaps or isolated areas, including PLZ that are not directly identified as clusters, provided they are in close proximity to clusters. We can achieve this using a Convex Hull/Alpha Shapes approach.
BUFFER_DISTANCE_METERS = 1000
all_plz_shapes = gp.read_file(plz_shapes, encoding='utf-8')
if top_plz.crs != all_plz_shapes.crs:
all_plz_shapes = all_plz_shapes.to_crs(top_plz.crs)
expanded_cluster_plzs_list = []
unique_cluster_ids = sorted([cid for cid in top_plz['cluster_label'].unique() if cid != -1])
for cid in unique_cluster_ids:
cluster_geometries = top_plz[top_plz['cluster_label'] == cid].geometry
if cluster_geometries.empty:
continue
cluster_union = cluster_geometries.unary_union
cluster_hull = cluster_union.convex_hull
buffered_hull = cluster_hull.buffer(BUFFER_DISTANCE_METERS)
hull_gdf = gp.GeoDataFrame(
[{'geometry': buffered_hull, 'cluster_id_hull': cid}], crs=top_plz.crs)
plzs_in_buffered_hull = gp.sjoin(
all_plz_shapes, hull_gdf, how="inner", predicate="intersects")
if not plzs_in_buffered_hull.empty:
selected_plzs_with_id = plzs_in_buffered_hull[['plz', 'geometry']].copy()
selected_plzs_with_id['expanded_cluster_id'] = cid
expanded_cluster_plzs_list.append(selected_plzs_with_id)
all_expanded_plzs = pd.concat(expanded_cluster_plzs_list)
final_expanded_clusters = all_expanded_plzs.dissolve(
by='expanded_cluster_id', aggfunc='first')
final_expanded_clusters = final_expanded_clusters.reset_index()
print(f"Created {len(final_expanded_clusters)} expanded cluster areas.")
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
final_expanded_clusters.plot(
ax=ax, column='expanded_cluster_id', categorical=True,
legend=True, alpha=0.7, legend_kwds={'ncol': 2, 'title': 'Expanded Cluster ID'})
top_plz[top_plz['cluster_label'] != -1].plot(
ax=ax, column='cluster_label', categorical=True, legend=False,
edgecolor='black', linewidth=0.5, alpha=0.5)
shapes.plot(ax=ax, color='none', edgecolor='black', linewidth=0.2, zorder=0)
ax.set_axis_off()
ax.set_title(f"Geographic Clusters (HDBSCAN) of Selected PLZ Areas as Convex Hulls/Alpha Shapes")
leg = ax.get_legend()
leg.set_bbox_to_anchor((0., 0., 0.0, 0.6))
leg.get_frame().set_edgecolor('none')
plt.show()
Merge back to original PLZ shapes, so we can export this.
if final_expanded_clusters.crs != all_plz_shapes.crs:
all_plz_shapes = all_plz_shapes.to_crs(final_expanded_clusters.crs)
plz_in_expanded_clusters = gp.sjoin(
all_plz_shapes,
final_expanded_clusters[['expanded_cluster_id', 'geometry']],
how="inner",
predicate="intersects")
if 'index_right' in plz_in_expanded_clusters.columns:
plz_in_expanded_clusters = plz_in_expanded_clusters.drop(columns=['index_right'])
len(plz_in_expanded_clusters)
Export PLZ selection¶
Export to GeoPackage, Shapefile and CSV (geometry dropped). Remove irrelevant columns first.
top_plz.columns
tools.drop_cols_except(
df=top_plz,
columns_keep=[
"geometry", "einwohner", "plz", "qkm", "cluster_label",
"note", "usercount", "postcount", "log_density_ratio"])
top_plz['einwohner'] = top_plz['einwohner'].astype('Int64')
top_plz.to_file(
OUTPUT / f"{today}_top_plz.gpkg", layer="top_plz", driver="GPKG")
top_plz.drop(columns='geometry').to_csv(
OUTPUT / f"{today}_top_plz.csv", index=True, encoding='utf-8')
top_plz.to_file(
OUTPUT / f"{today}_top_plz.shp", driver="ESRI Shapefile")
Do the same for the PLZ intersecting the expanded clusters (plz_in_expanded_clusters)
plz_in_expanded_clusters.to_file(
OUTPUT / f"{today}_expanded_clusters.gpkg", layer="expanded_clusters", driver="GPKG")
plz_in_expanded_clusters.drop(columns='geometry').to_csv(
OUTPUT / f"{today}_expanded_clusters.csv", index=True, encoding='utf-8')
plz_in_expanded_clusters.to_file(
OUTPUT / f"{today}_expanded_clusters.shp", driver="ESRI Shapefile")
Create notebook HTML¶
!jupyter nbconvert --to html_toc \
--output-dir=../resources/html/ ./07_plz_visualization.ipynb \
--template=../nbconvert.tpl \
--ExtractOutputPreprocessor.enabled=False >&- 2>&-