Grid-based Exploration of geosocial patterns (Germany) ¶
Alexander Dunkel, Leibniz Institute of Ecological Urban and Regional Development,
Transformative Capacities & Research Data Centre (IÖR-FDZ)
In this notebook, we synthesize the data further and look at patterns in aggregated grid cells.
Prepare environment¶
To run this notebook, you have two options:
1. Create an environment with the packages and versions shown in the following cell.
As a starting point, you may use the latest conda environment_default.yml from our Carto-Lab Docker container.
2. If Docker is available to you, we suggest using the Carto-Lab Docker container:
Clone the repository and edit the JUPYTER_NOTEBOOKS
value in your .env to point to the repository where this notebook can be found, e.g.:
git clone https://gitlab.vgiscience.de/lbsn/tools/jupyterlab.git
cd jupyterlab
cp .env.example .env
nano .env
## Enter:
# JUPYTER_NOTEBOOKS=~/notebooks/geosocial_patterns_de
# TAG=v0.12.3
docker network create lbsn-network
docker-compose pull && docker-compose up -d
Load dependencies:
import os
import csv
import sys
import colorcet
import psycopg2 # Postgres API
import geoviews as gv
import holoviews as hv
import mapclassify as mc
import geopandas as gp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geoviews.feature as gf
from pathlib import Path
from collections import namedtuple
from typing import List, Tuple, Dict, Optional
from pyproj import Transformer, CRS, Proj
from geoviews import opts
from shapely.geometry import shape, Point, Polygon
from shapely.ops import transform
from cartopy import crs
from matplotlib import colors
from IPython.display import clear_output, display, HTML
from bokeh.models import HoverTool, FuncTickFormatter, FixedTicker
# optionally, enable shapely.speedups,
# which makes some of the spatial
# queries run faster
import shapely.speedups as speedups
from shapely import geometry
Activate autoreload of changed python files:
%load_ext autoreload
%autoreload 2
Load helper modules:
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
sys.path.append(module_path)
from modules.base import tools, preparations
from modules.base import grid as gr
Initialize Bokeh and shapely.speedups
preparations.init_imports()
Parameters¶
Define initial parameters that affect processing
WORK_DIR = Path.cwd().parents[0] / "tmp" # Working directory
EPSG_CODE = 25832 # Target projection EPSG Code
CRS_PROJ = f"epsg:{EPSG_CODE}" # Target projection
CRS_WGS = "epsg:4326" # Input projection (WGS84)
OUTPUT = Path.cwd().parents[0] / "out" # Define path to output directory (figures etc.)
GRID_SIZE_METERS = 10000 # Define Grid size in Meters
WORK_DIR.mkdir(exist_ok=True)
tools.create_paths(OUTPUT)
Folder structure and input data¶
root = Path.cwd().parents[0] / "data"
tools.tree(Path.cwd().parents[0], ignore_files_folders=["out", "tmp"])
ORIGIN_DE_HLL = root / "2024-09-09_origin_DE_HLL.csv"
Create grid¶
Define the coordinate transformers between WGS1984 and the target projection ahead of time
# define Transformer ahead of time
# with xy-order of coordinates
PROJ_TRANSFORMER = Transformer.from_crs(
CRS_WGS, CRS_PROJ, always_xy=True)
# also define reverse projection
PROJ_TRANSFORMER_BACK = Transformer.from_crs(
CRS_PROJ, CRS_WGS, always_xy=True)
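As a quick check, a coordinate pair can be round-tripped through both transformers (the Dresden coordinates below are purely illustrative):
# illustrative only: project a WGS84 lon/lat pair to UTM and back
x, y = PROJ_TRANSFORMER.transform(13.74, 51.05)
lon, lat = PROJ_TRANSFORMER_BACK.transform(x, y)
print(f"({x:.0f}, {y:.0f}) -> ({lon:.2f}, {lat:.2f})")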
Get Shapefile for Germany¶
source_zip = "https://daten.gdz.bkg.bund.de/produkte/vg/vg2500/aktuell/"
filename = "vg2500_12-31.utm32s.shape.zip"
shapes_name = "vg2500_12-31.utm32s.shape/vg2500/VG2500_LAN.shp"
SHAPE_DIR = (OUTPUT / "shapes")
SHAPE_DIR.mkdir(exist_ok=True)
if not (SHAPE_DIR / shapes_name).exists():
tools.get_zip_extract(uri=source_zip, filename=filename, output_path=SHAPE_DIR)
else:
print("Already exists")
shapes = gp.read_file(SHAPE_DIR / shapes_name)
shapes = shapes.to_crs(f"EPSG:{EPSG_CODE}")
Create overlay
ax = shapes.plot(color='none', edgecolor='black', linewidth=0.2, figsize=(2, 4))
ax.set_axis_off()
Get bounds
XMIN, YMIN, XMAX, YMAX = shapes.total_bounds
print(f'Projected bounds: {[XMIN, YMIN, XMAX, YMAX]}')
Make Grid¶
def create_grid_df(
grid_size: int = GRID_SIZE_METERS,
xmin: float = XMIN, ymin: float = YMIN,
xmax: float = XMAX, ymax: float = YMAX,
report: bool = False,
return_rows_cols: bool = False):
"""Creates dataframe polygon grid based on width and length in Meters"""
width = grid_size
length = grid_size
cols = list(range(int(np.floor(xmin)), int(np.ceil(xmax)), width))
rows = list(range(int(np.floor(ymin)), int(np.ceil(ymax)), length))
if report:
print(len(cols))
print(len(rows))
rows.reverse()
polygons = []
for x in cols:
for y in rows:
# combine to tuple: (x,y, poly)
# and append to list
polygons.append(
(x, y,
Polygon([
(x,y),
(x+width, y),
(x+width, y-length),
(x, y-length)])) )
# create a pandas dataframe
# from list of tuples
grid = pd.DataFrame(polygons)
# name columns
col_labels=['xbin', 'ybin', 'bin_poly']
grid.columns = col_labels
# use x and y as index columns
grid.set_index(['xbin', 'ybin'], inplace=True)
if return_rows_cols:
return grid, rows, cols
return grid
grid, rows, cols = create_grid_df(
grid_size=GRID_SIZE_METERS,
report=True, return_rows_cols=True)
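A minimal sanity check: the DataFrame should contain one polygon per column/row combination.
# one grid cell per (col, row) pair
print(f"{len(cols)} cols x {len(rows)} rows = {len(grid)} cells")
assert len(grid) == len(cols) * len(rows)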
Create a geodataframe from dataframe:
def grid_to_gdf(
grid: pd.DataFrame, crs_proj: str = CRS_PROJ) -> gp.GeoDataFrame:
"""Convert grid pandas DataFrame to geopandas Geodataframe"""
grid = gp.GeoDataFrame(
grid.drop(
columns=["bin_poly"]),
geometry=grid.bin_poly)
grid.crs = crs_proj
return grid
grid = grid_to_gdf(grid)
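Both layers should now share the target CRS; a quick assertion before plotting them together:
# grid was assigned CRS_PROJ, shapes were reprojected to the same EPSG code
assert grid.crs == shapes.crs, "CRS mismatch between grid and shapes"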
Preview Grid¶
Plot grid and combine with DE geometry
base = grid.plot(figsize=(6, 9), color='white', edgecolor='black', linewidth=0.1)
plot = shapes.plot(ax=base, alpha=0.7, color="#7aa0c4", edgecolor='black', linewidth=0.25)
base.set_axis_off()
YBINS = np.array(rows)
XBINS = np.array(cols)
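For reference, a single projected coordinate can be assigned to its grid cell with np.digitize; the helper module presumably follows similar logic (function name and example coordinates here are illustrative):
def get_best_bins(x: float, y: float) -> Tuple[int, int]:
    """Return the (xbin, ybin) cell index containing a projected point.

    XBINS ascend; YBINS descend (rows were reversed above)."""
    xbin = XBINS[np.digitize(x, XBINS) - 1]
    ybin = YBINS[np.digitize(y, YBINS) - 1]
    return int(xbin), int(ybin)

# example: project a WGS84 coordinate, then look up its cell
x, y = PROJ_TRANSFORMER.transform(13.74, 51.05)
print(get_best_bins(x, y))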
Process data¶
We want to aggregate geosocial post locations to grid bins. For this, we largely follow the processing described here.
Preview input data:
pd.set_option('display.max_colwidth', 150)
tools.record_preview_hll(ORIGIN_DE_HLL)
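Optionally, preview only the coordinate columns used in the aggregation below (the column names match the usecols parameter further down):
pd.read_csv(
    ORIGIN_DE_HLL,
    usecols=['latitude', 'longitude', 'origin_id'],
    nrows=5)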
Credentials are defined as environment variables and loaded here:
DB_USER = "postgres"
DB_PASS = os.getenv('POSTGRES_PASSWORD')
# set connection variables
DB_HOST = "hlldb"
DB_PORT = "5432"
DB_NAME = "hlldb"
Connect to empty Postgres database running HLL Extension:
DB_CONN = psycopg2.connect(
host=DB_HOST,
port=DB_PORT,
dbname=DB_NAME,
user=DB_USER,
password=DB_PASS
)
DB_CONN.set_session(
readonly=True)
DB_CALC = tools.DbConn(
DB_CONN)
CUR_HLL = DB_CONN.cursor()
Test connection:
CUR_HLL.execute("SELECT 1;")
print(CUR_HLL.statusmessage)
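As a further check that the HLL extension responds, build a sketch from two hashed values and read its estimated cardinality; this uses only functions of the postgresql-hll extension and should print 2:
# union two hashed values into one HLL sketch and estimate its cardinality
CUR_HLL.execute("""
    SELECT hll_cardinality(
        hll_add(
            hll_add(hll_empty(), hll_hash_text('user_a')),
            hll_hash_text('user_b')));
    """)
print(CUR_HLL.fetchone()[0])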
Plotting DE maps: Post Count¶
gr.reset_metrics(grid)
kwds = {
"data": ORIGIN_DE_HLL,
"grid": grid,
"store_benchmark_data": False,
"proj_transformer": PROJ_TRANSFORMER,
"proj_transformer_back": PROJ_TRANSFORMER_BACK,
"xbins": XBINS, "ybins": YBINS,
"db_calc": DB_CALC,
"store_benchmark_data": False,
"output": OUTPUT,
"bg": shapes, "figsize": (4, 9),
"usecols": ['latitude', 'longitude', 'origin_id'],
"scheme": "Quantiles",
"cmap_name": "bone",
"inverse": False,
"edgecolor": "black"
}
Define the mapping of origin names to IDs, used to plot maps for individual data sources:
origin_mapping = {
    "instagram": 1,
    "flickr": 2,
    "twitter": 3,
    "inaturalist": 23,
}
%%time
gr.load_plot(
title='Estimated Post Count (all) per 10 km grid, 2007-2022',
filename="postcount_est",
**kwds)
%%time
gr.load_plot(
title='Estimated Post Count (Instagram) per 10 km grid, 2007-2022',
filename="postcount_est", origin=1, **kwds)
%%time
gr.load_plot(
title='Estimated Post Count (Flickr) per 10 km grid, 2007-2022',
filename="postcount_est", origin=2, **kwds)
%%time
gr.load_plot(
title='Estimated Post Count (Twitter) per 10 km grid, 2007-2022',
filename="postcount_est", origin=3, **kwds)
%%time
gr.load_plot(
title='Estimated Post Count (iNaturalist) per 10 km grid, 2007-2022',
filename="postcount_est", origin=23, **kwds)
Create notebook HTML¶
!jupyter nbconvert --to html_toc \
--output-dir=../resources/html/ ./02_grid_exploration.ipynb \
--template=../nbconvert.tpl \
--ExtractOutputPreprocessor.enabled=False >&- 2>&-