Exploration of geosocial patterns (Germany)

Alexander Dunkel, Leibniz Institute of Ecological Urban and Regional Development,
Transformative Capacities & Research Data Centre (IÖR-FDZ)


Last updated: Aug-08-2024, Carto-Lab Docker Version 0.18.0

In this project, I explore basic geosocial media patterns for Germany.

Prepare environment

To run this notebook, you have two options:

1. Create an environment with the packages and versions shown in the following cell.

As a starting point, you may use the latest conda environment_default.yml from our Carto-Lab Docker container.

2. If Docker is available to you, we suggest using the Carto-Lab Docker Container.

Clone the repository and edit your .env value to point to the directory where this notebook can be found, e.g.:

git clone https://gitlab.vgiscience.de/lbsn/tools/jupyterlab.git
cd jupyterlab
cp .env.example .env
nano .env
## Enter:
# JUPYTER_NOTEBOOKS=~/notebooks/geosocial_patterns_de
# TAG=v0.12.3
docker network create lbsn-network
docker-compose pull && docker-compose up -d
List of package versions used in this notebook:

package   python  dask      datashader  geopandas  matplotlib  pandas
version   3.12.4  2024.7.1  0.16.3      1.0.1      3.9.1       2.2.2
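
If you want to reproduce such a version overview in your own environment, here is a minimal sketch using importlib.metadata (the package list is an assumption based on the table above):

import sys
from importlib.metadata import version

# print interpreter and package versions (sketch)
print("python", sys.version.split()[0])
for pkg in ["dask", "datashader", "geopandas", "matplotlib", "pandas"]:
    print(pkg, version(pkg))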

Load dependencies:

In [1]:
import os
from pathlib import Path
import geopandas as gp
import pandas as pd
import matplotlib.pyplot as plt
from typing import List, Tuple, Dict, Optional
from IPython.display import clear_output, display, HTML

Activate autoreload of changed python files:

In [2]:
%load_ext autoreload
%autoreload 2

Parameters

Define initial parameters that affect processing

In [3]:
OUTPUT = Path.cwd().parents[0] / "out"  # output directory for generated files
OUTPUT.mkdir(exist_ok=True)
In [4]:
(Path.cwd().parents[0] / "notebooks").mkdir(exist_ok=True)
(Path.cwd().parents[0] / "py").mkdir(exist_ok=True)
In [5]:
CHUNK_SIZE = 5_000_000  # number of CSV rows processed per iteration
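
With two float64 columns per row, one chunk of this size should occupy roughly 76 MiB in memory; a quick back-of-the-envelope check (a sketch, assuming only the two coordinate columns are kept):

# rough memory estimate per chunk (assumption: two float64 columns)
n_cols, bytes_per_value = 2, 8
print(f"~{CHUNK_SIZE * n_cols * bytes_per_value / 1024**2:,.0f} MiB per chunk")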

Load and visualize data

In [6]:
import numpy as np
import pandas as pd
import dask
import dask.dataframe as dd
import dask.diagnostics as diag
import datashader.transfer_functions as tf
import datashader as ds
from datashader.utils import lnglat_to_meters
from IPython.display import clear_output
from pathlib import Path
In [7]:
filename = Path.cwd().parents[0] / 'data' / '2024-07-31_DE_All_exportAllLatLng.csv'
dtypes = {'latitude': float, 'longitude': float}
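
Before the full chunked run, it can help to confirm that the CSV matches the expected schema; a minimal sketch, assuming the file exists at the path above:

# preview the first rows to confirm the latitude/longitude columns (sketch)
pd.read_csv(filename, nrows=5, dtype=dtypes, encoding='utf-8')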
In [9]:
%%time
parquet_output = OUTPUT / "twitter_proj.snappy.parq"
if not parquet_output.exists():
    iter_csv = pd.read_csv(
        filename, iterator=True,
        dtype=dtypes, encoding='utf-8', chunksize=CHUNK_SIZE)

    cnt = 0

    for ix, chunk in enumerate(iter_csv):
        # report progress; cnt is approximate, the last chunk may be smaller
        cnt += CHUNK_SIZE
        clear_output(wait=True)
        print(f"Processed {cnt:,.0f} coordinates..")
        # read: wrap the pandas chunk in a single-partition Dask DataFrame
        dd_chunk = dd.from_pandas(chunk, npartitions=1)
        # project: convert WGS84 lat/lng to Web Mercator meters
        web_mercator_x, web_mercator_y = lnglat_to_meters(
            dd_chunk['longitude'], dd_chunk['latitude'])
        projected_coordinates = dd.concat(
            [web_mercator_x, web_mercator_y], axis=1)
        transformed = projected_coordinates.rename(
            columns={'longitude': 'x', 'latitude': 'y'})
        # store: write the first chunk, append all subsequent chunks
        dd.to_parquet(
            transformed, parquet_output,
            append=ix > 0, compression="SNAPPY")
Processed 70,000,000 coordinates..
CPU times: user 20.4 s, sys: 2.96 s, total: 23.4 s
Wall time: 23.4 s
In [10]:
datasize = sum(
    f.stat().st_size for f in parquet_output.glob('**/*') if f.is_file()) / 1024**3
print(f"Size: {datasize:,.1f} GB")
Size: 0.5 GB
In [11]:
df = dd.read_parquet(parquet_output)
if datasize < 8:
    # keep the dataset in memory if it fits comfortably (< 8 GB)
    df = df.persist()
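
As an optional sanity check, the total number of projected coordinates can be counted from the Dask DataFrame (a sketch; this triggers a full scan, which is cheap once the data is persisted):

# count rows across all parquet partitions (sketch)
print(f"{len(df):,} coordinates loaded")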
In [12]:
df.columns
Out[12]:
Index(['x', 'y'], dtype='object')
In [13]:
df.head()
Out[13]:
              x             y
0  1.483521e+06  6.891437e+06
1  7.615500e+05  6.609943e+06
2  1.491849e+06  6.896486e+06
3  9.711956e+05  6.063937e+06
4  7.902371e+05  6.540994e+06
In [76]:
def bounds(x_range, y_range):
    """Project a lat/lng extent to Web Mercator ranges for ds.Canvas"""
    x, y = lnglat_to_meters(x_range, y_range)
    return dict(x_range=x, y_range=y)

Earth       = ((-180.00, 180.00), (-59.00, 74.00))
France      = (( -12.00,  16.00), ( 41.26, 51.27))
Berlin      = (( 12.843018,  14.149704), ( 52.274880, 52.684292))
Dresden     = (( 13.415680,  14.703827), ( 50.740090, 51.194905))
USA         = (( -126,  -64), ( 24.92, 49.35))
Paris       = ((   2.05,   2.65), ( 48.76, 48.97))
DE          = ((   4.605469,   15.372070), ( 46.697243, 55.065885))
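
For illustration, calling bounds() on the DE extent returns the projected Web Mercator ranges that ds.Canvas expects (a minimal usage sketch):

# example: the DE lat/lng extent projected to Web Mercator meters (sketch)
de_bounds = bounds(*DE)
print(de_bounds['x_range'], de_bounds['y_range'])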
In [55]:
plot_width = 1000  # canvas width in pixels
plot_height = 600  # canvas height in pixels
cvs = ds.Canvas(plot_width=plot_width, plot_height=plot_height, **bounds(*Earth))
In [56]:
with diag.ProgressBar(), diag.Profiler() as prof, diag.ResourceProfiler(0.5) as rprof:
    agg = cvs.points(df, x='x', y='y')
[########################################] | 100% Completed | 102.74 ms
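
To get a quick visual impression of the global aggregation before zooming in, the canvas can be shaded directly (a sketch using the same colormap as the plots below):

# quick visual check of the global aggregation (sketch)
tf.shade(agg, cmap=["lightblue", "darkblue"])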
In [73]:
def plot(x_range, y_range, plot_width: Optional[int] = None):
    """Plot df using tf.shade()

    The plot height is derived from the extent's lat/lng aspect
    ratio, scaled by 1.5 to approximate the Web Mercator stretch.
    """
    if plot_width is None:
        plot_width = 1000
    lng_width = x_range[1] - x_range[0]
    lat_height = y_range[1] - y_range[0]
    plot_height = int(((plot_width * lat_height) / lng_width) * 1.5)
    cvs = ds.Canvas(plot_width=plot_width, plot_height=plot_height, **bounds(x_range, y_range))
    with diag.ProgressBar(), diag.Profiler() as prof, diag.ResourceProfiler(0.5) as rprof:
        agg = cvs.points(df, x='x', y='y')
    return tf.shade(agg, cmap=["lightblue", "darkblue"])

def save_image(img, output_name, return_img: bool = True):
    """Save image as PNG on a white background"""
    ds.utils.export_image(
        img=img, filename=str(OUTPUT / output_name), fmt=".png", background='white')
    if return_img:
        return img
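
The factor 1.5 in plot() appears to be a fixed approximation of the Web Mercator latitude stretch; an illustrative check, assuming a mid-latitude of roughly 51° N for the German extent:

import math

# Web Mercator stretches distances by 1/cos(latitude);
# at ~51° N (central Germany) this is close to the fixed 1.5 used in plot()
print(1 / math.cos(math.radians(51)))  # ≈ 1.589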
In [74]:
%time save_image(plot(*DE), output_name='DE_map')
[########################################] | 100% Completed | 204.29 ms
CPU times: user 703 ms, sys: 397 ms, total: 1.1 s
Wall time: 740 ms
Out[74]:
[Image: DE_map.png, shaded point density map of Germany]
In [75]:
%time save_image(plot(*Berlin), output_name='Berlin_map')
[########################################] | 100% Completed | 103.02 ms
CPU times: user 350 ms, sys: 190 ms, total: 541 ms
Wall time: 610 ms
Out[75]:
[Image: Berlin_map.png, shaded point density map of Berlin]
In [77]:
%time save_image(plot(*Dresden), output_name='Dresden_map')
[########################################] | 100% Completed | 102.91 ms
CPU times: user 301 ms, sys: 220 ms, total: 522 ms
Wall time: 592 ms
Out[77]:
[Image: Dresden_map.png, shaded point density map of Dresden]
Create notebook HTML

In [1]:
!jupyter nbconvert --to html_toc \
    --output-dir=../resources/html/ ./01_overview_de.ipynb \
    --template=../nbconvert.tpl \
    --ExtractOutputPreprocessor.enabled=False >&- 2>&- # create single output file

IOER RDC Jupyter Base Template v0.10.0