Social Transformative Capacity ¶
Alexander Dunkel, Leibniz Institute of Ecological Urban and Regional Development,
Transformative Capacities & Research Data Centre (IÖR-FDZ)
Base data:
- 50 million Geosocial Media posts
- sorted by user-id, number of posts per user-id ascending
- an LLM is used to classify users
Preparations¶
Load dependencies:
import os
import csv
import sys
import colorcet
import psycopg2 # Postgres API
import geoviews as gv
import holoviews as hv
import mapclassify as mc
import geopandas as gp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geoviews.feature as gf
from pathlib import Path
from collections import namedtuple
from typing import List, Tuple, Dict, Optional
from pyproj import Transformer, CRS, Proj
from geoviews import opts
from shapely.geometry import shape, Point, Polygon
from shapely.ops import transform
from holoviews import dim
from cartopy import crs
from matplotlib import colors
from IPython.display import clear_output, display, HTML, Markdown, Code
from bokeh.models import HoverTool, FuncTickFormatter, FixedTicker
# optionally, enable shapely.speedups,
# which makes some of the spatial
# queries run faster
import shapely.speedups as speedups
from shapely import geometry
Activate autoreload of changed python files:
%load_ext autoreload
%autoreload 2
Load helper module.
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
sys.path.append(module_path)
from modules.base import tools, preparations
Initialize Bokeh and shapely.speedups
preparations.init_imports()
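preparations.init_imports() is a project helper; a minimal sketch of what it presumably does, assuming it only activates the Bokeh plotting backend for HoloViews and enables shapely speedups:
# Hypothetical equivalent of preparations.init_imports()
def init_imports():
    """Enable the Bokeh backend for HoloViews and shapely speedups."""
    hv.notebook_extension("bokeh")
    if speedups.available:
        speedups.enable()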
Parameters¶
Define initial parameters that affect processing
WORK_DIR = Path.cwd().parents[0] / "tmp" # Working directory
EPSG_CODE = 25832 # Target projection EPSG Code
CRS_PROJ = f"epsg:{EPSG_CODE}" # Target projection
CRS_WGS = "epsg:4326" # Input projection (Web Mercator)
OUTPUT = Path.cwd().parents[0] / "out" # Define path to output directory (figures etc.)
SHAPE_DIR = OUTPUT / "shapes" # Shapefiles
WORK_DIR.mkdir(exist_ok=True)
tools.create_paths(OUTPUT)
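The pyproj Transformer imported above can translate coordinates between the input and target projections. A quick sanity check (illustrative only; the lon/lat pair is an assumed point near Berlin):
# Project a WGS 1984 lon/lat pair to UTM zone 32N (EPSG:25832)
transformer = Transformer.from_crs(CRS_WGS, CRS_PROJ, always_xy=True)
transformer.transform(13.405, 52.52)  # lon, lat near the Berlin city centre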
City shapes¶
We will get the city shape areas from the BKG product "VG250-EW", see Verwaltungsgebiete.
zip_citybd = "https://daten.gdz.bkg.bund.de/produkte/vg/vg250-ew_ebenen_1231/aktuell/vg250-ew_12-31.utm32s.gpkg.ebenen.zip"
tools.get_zip_extract(
uri_filename=zip_citybd, output_path=SHAPE_DIR)
vg_shapes = SHAPE_DIR / "vg250-ew_12-31.utm32s.gpkg.ebenen" / "vg250-ew_ebenen_1231" / "DE_VG250.gpkg"
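tools.get_zip_extract() comes from the helper module; a minimal sketch of the download-and-extract step it performs (assuming no caching or checksum logic in this simplified version) might look like this:
# Minimal download-and-extract sketch (the real helper may cache results)
import urllib.request
import zipfile

def get_zip_extract(uri_filename: str, output_path: Path):
    """Download a zip file and extract its contents to output_path."""
    output_path.mkdir(exist_ok=True, parents=True)
    zip_path = output_path / Path(uri_filename).name
    urllib.request.urlretrieve(uri_filename, zip_path)
    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(output_path)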
Load from file and plot preview
shapes = gp.read_file(vg_shapes, layer="vg250_vwg")
ax = shapes.plot()
ax.set_axis_off()
Select all larger cities¶
tools.drop_cols_except(df=shapes, columns_keep=["geometry", "EWZ", "GEN"])
shapes.head()
Select all cities with more than 100,000 inhabitants (column EWZ, the population count):
lcities_shapes = shapes[shapes["EWZ"] > 100000]
Load the Bundesländer (federal states) shapes, for plotting reference.
de_shapes = tools.get_shapes("de", shape_dir=SHAPE_DIR)
de_shapes = de_shapes.to_crs(f"EPSG:{EPSG_CODE}")
Plot a preview in two layers, highlighting the selected larger cities.
plt_kwargs = {
    "color": "none",
    "edgecolor": "black",
    "linewidth": 0.2,
    "figsize": (2, 4),
}
ax = de_shapes.plot(**plt_kwargs)
ax = lcities_shapes.plot(ax=ax, color="red")
ax.set_axis_off()
lcities_shapes.head()
Dissolve shapes by the GEN column (city reference):
lcities_shapes = lcities_shapes.dissolve(by='GEN')
len(lcities_shapes)
The count matches the 83 large cities specified in the Wikipedia article. We now have our base for measuring the sphere of influence of cities.
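Optionally, pin this expectation, so the notebook fails early if the upstream BKG data changes:
# Sanity check: expect 83 cities with more than 100,000 inhabitants
assert len(lcities_shapes) == 83, "Unexpected number of large cities"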
Intersect with Social Media Data¶
Test city: Berlin¶
Select a single city from our previous dataset: Berlin
berlin_shp = lcities_shapes[lcities_shapes.index=="Berlin"]
ax = berlin_shp.plot(**plt_kwargs)
ax.set_axis_off()
Simplify the shape with a 1000 m tolerance.
berlin_shp = berlin_shp.simplify(tolerance=1000)
ax = berlin_shp.plot(**plt_kwargs)
ax.set_axis_off()
Get the Well-Known Text (WKT) from the shape, for use in the PostGIS query. Note that we project the shape back to WGS 1984, as this is what PostGIS expects for the respective column projection.
berlin_wkt = berlin_shp.to_crs(CRS_WGS).iloc[0].wkt
berlin_wkt[:50]
Load the database password from a .env file and connect to the Postgres backend:
from dotenv import load_dotenv
load_dotenv(
Path.cwd().parents[0] / '.env', override=True)
db_user = "postgres"
db_pass = os.getenv('POSTGRES_PASSWORD')
db_host = "hlldb"
db_port = "5432"
db_name = "hlldb"
db_connection_hll = psycopg2.connect(
host=db_host,
port=db_port,
dbname=db_name,
user=db_user,
password=db_pass
)
db_conn_hll = tools.DbConn(db_connection_hll)
cur_hll = db_connection_hll.cursor()
cur_hll.execute("SELECT 1;")
print(cur_hll.statusmessage)
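tools.DbConn is a small wrapper from the project's helper module. A minimal sketch of what its query() method presumably does, assuming it simply wraps pandas:
# Hypothetical minimal version of the tools.DbConn wrapper
class DbConn:
    def __init__(self, db_connection):
        self.db_connection = db_connection

    def query(self, sql_query: str) -> pd.DataFrame:
        """Run a SQL query and return the result as a DataFrame."""
        return pd.read_sql_query(sql_query, self.db_connection)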
MVIEWS_REF = "all_posts_de"
Check if the foreign table has been imported already:
sql_query = f"""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_schema = 'mviews'
AND table_name = '{MVIEWS_REF}'
);
"""
result = db_conn_hll.query(sql_query)
result["exists"][0]
PostGIS SQL Query¶
Select all posts whose post_latlng lies within berlin_wkt. Remove the LIMIT 10 after testing.
%%time
cols = "origin_id, post_body, hashtags, emoji"
sql_query = f"""
CREATE MATERIALIZED VIEW mviews.all_posts_de_berlin AS
SELECT {cols}
FROM mviews.{MVIEWS_REF} p1
WHERE
ST_Intersects(p1.post_latlng,
ST_GeographyFromText(
'SRID=4326;
{berlin_wkt}
'))
--LIMIT 10
;
"""
# samples = db_conn_hll.query(sql_query)
Since this is a long-running query, you may want to connect directly to the Postgres backend and issue the command below in a terminal multiplexer (e.g. byobu).
Code(sql_query, language='sql')
E.g.:
byobu                                        # start a terminal multiplexer session
docker exec -it lbsn-hlldb /bin/bash         # enter the hlldb container
psql -h localhost -p 5432 -U postgres hlldb  # open a psql session
...
[F6]                                         # detach from the byobu session
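Once the materialized view has been created on the backend, load a small sample back into the notebook (the view name follows the pattern defined above):
samples = db_conn_hll.query(
    f"SELECT * FROM mviews.{MVIEWS_REF}_berlin LIMIT 10;")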
samples.head()
Create notebook HTML¶
!jupyter nbconvert --to html_toc \
--output-dir=../resources/html/ ./04_influence_sphere_cities.ipynb \
--template=../nbconvert.tpl \
--ExtractOutputPreprocessor.enabled=False >&- 2>&-