Topic Exploration: Startup
in Germany ¶
Alexander Dunkel, Leibniz Institute of Ecological Urban and Regional Development,
Transformative Capacities & Research Data Centre (IÖR-FDZ)
Following the ideas in [1], we study the occurence of the term startup
in Germany in different Geosocial Meda (Instagram Flickr, Twitter).
(1) Corradini, C.; Santini, E.; Vecciolini, C. Place Promotion, Place Branding and Social Media Communication around Entrepreneurial Ecosystems: A Twitter Analysis. Regional Studies 0 (0), 1–14. https://doi.org/10.1080/00343404.2023.2239275.
Prepare environment¶
To run this notebook, as a starting point, you have two options:
1. Create an environment with the packages and versions shown in the following cell.
As a starting point, you may use the latest conda environment_default.yml from our CartoLab docker container.
2. If docker is available to you, we suggest to use the Carto-Lab Docker Container
Clone the repository and edit your .env
value to point to the repsitory, where this notebook can be found, e.g.:
git clone https://gitlab.vgiscience.de/lbsn/tools/jupyterlab.git
cd jupyterlab
cp .env.example .env
nano .env
## Enter:
# JUPYTER_NOTEBOOKS=~/notebooks/geosocial_patterns_de
# TAG=v0.12.3
docker network create lbsn-network
docker-compose pull && docker-compose up -d
Load dependencies:
import os
import csv
import sys
import colorcet
import psycopg2 # Postgres API
import geoviews as gv
import holoviews as hv
import mapclassify as mc
import geopandas as gp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geoviews.feature as gf
from pathlib import Path
from collections import namedtuple
from pathlib import Path
from typing import List, Tuple, Dict, Optional
from pyproj import Transformer, CRS, Proj
from geoviews import opts
from shapely.geometry import shape, Point, Polygon
from shapely.ops import transform
from holoviews import dim
from cartopy import crs
from matplotlib import colors
from IPython.display import clear_output, display, HTML
from bokeh.models import HoverTool, FuncTickFormatter, FixedTicker
# optionally, enable shapely.speedups
# which makes some of the spatial
# queries running faster
import shapely.speedups as speedups
from shapely import geometry
Activate autoreload of changed python files:
%load_ext autoreload
%autoreload 2
Load helper module.
module_path = str(Path.cwd().parents[0] / "py")
if module_path not in sys.path:
sys.path.append(module_path)
from modules.base import tools, preparations
from modules.base import grid as gr
Initialize Bokeh and shapely.speedups
preparations.init_imports()
Parameters¶
Define initial parameters that affect processing
WORK_DIR = Path.cwd().parents[0] / "tmp" # Working directory
EPSG_CODE = 25832 # Target projection EPSG Code
CRS_PROJ = f"epsg:{EPSG_CODE}" # Target projection
CRS_WGS = "epsg:4326" # Input projection (Web Mercator)
OUTPUT = Path.cwd().parents[0] / "out" # Define path to output directory (figures etc.)
GRID_SIZE_METERS = 10000 # Define Grid size in Meters
WORK_DIR.mkdir(exist_ok=True)
tools.create_paths(OUTPUT)
Folder structure and input data¶
root = Path.cwd().parents[0] / "data"
tools.tree(Path.cwd().parents[0], ignore_files_folders=["out", "tmp"])
Connect to Database¶
from dotenv import load_dotenv
load_dotenv(
Path.cwd().parents[0] / '.env', override=True)
DB_NAME_RAWDB = os.getenv("DB_NAME_RAWDB") # lbsn-rawdb name
DB_HOST_RAWDB = os.getenv("DB_HOST_RAWDB") # lbsn-rawdb name
db_user = "postgres"
db_pass = os.getenv('POSTGRES_PASSWORD')
db_host = "hlldb"
db_port = "5432"
db_name = "hlldb"
db_connection_hll = psycopg2.connect(
host=db_host,
port=db_port,
dbname=db_name,
user=db_user,
password=db_pass
)
db_conn_hll = tools.DbConn(db_connection_hll)
cur_hll = db_connection_hll.cursor()
cur_hll.execute("SELECT 1;")
print(cur_hll.statusmessage)
Simplify query access:
db_conn = tools.DbConn(db_connection_hll)
db_conn.query("SELECT 1;")
Create Query Schema¶
Create a new schema called mviews and update Postgres search_path, to include new schema:
sql_query = """
CREATE SCHEMA IF NOT EXISTS mviews;
ALTER DATABASE hlldb
SET search_path = "$user",
social,
spatial,
temporal,
topical,
interlinkage,
extensions,
mviews;"""
Since the above query will not return any result, we'll directly use the psycopg2 cursor object:
cur = db_connection_hll.cursor()
cur.execute(sql_query)
print(cur.statusmessage)
By using Foreign Table, this step will establish the connection between hlldb to rawdb.
On hlldb, install postgres_fdw extension:
sql_query = """
CREATE EXTENSION IF NOT EXISTS postgres_fdw SCHEMA extensions;
"""
cur_hll.execute(sql_query)
print(cur_hll.statusmessage)
MVIEWS_REF = "spatiallatlng_raw_startup_de"
Check if foreign table has been imported already:
sql_query = f"""
SELECT EXISTS (
SELECT FROM information_schema.tables
WHERE table_schema = 'mviews'
AND table_name = '{MVIEWS_REF}'
);
"""
result = db_conn_hll.query(sql_query)
result["exists"][0]
Conditional load password - this only need to be done once, if the server hasn't been added before.
Enter the password for pg user lbsn_reader
USER_KEY = None
if not result["exists"][0]:
import getpass
USER_KEY = getpass.getpass()
Create Foreign Server connection to rawdb, on hlldb:
sql_query = f"""
CREATE SERVER IF NOT EXISTS lbsnraw
FOREIGN DATA WRAPPER postgres_fdw
OPTIONS (
host '{DB_NAME_RAWDB}',
dbname '{DB_HOST_RAWDB}',
port '5432',
keepalives '1',
keepalives_idle '30',
keepalives_interval '10',
keepalives_count '5',
fetch_size '500000');
CREATE USER MAPPING IF NOT EXISTS for postgres
SERVER lbsnraw
OPTIONS (user 'lbsn_reader', password '{USER_KEY}');
"""
if not result["exists"][0]:
cur_hll.execute(sql_query)
print(cur_hll.statusmessage)
Optional cleanup step:
result["exists"][0]Import foreign table definition on the hlldb.
sql_query = f"""
IMPORT FOREIGN SCHEMA mviews
LIMIT TO (
{MVIEWS_REF})
FROM SERVER lbsnraw
INTO mviews;
"""
# only import table
# if it hasn't been imported already
if not result["exists"][0]:
cur_hll.execute(sql_query)
print(cur_hll.statusmessage)
Raw->Hll Conversion¶
HyperLogLog parameters
The HyperLogLog extension for Postgres from Citus that we're using here, contains several tweaks, to optimize performance, that can affect sensitivity of data.
From a privacy perspective, for example, it is recommended to disable explicit mode.
Explicit mode? When explicit mode is active, full IDs will be stored for small sets. In our case, any coordinates frequented by few users (outliers) would store full user and post IDs.
To disable explicit mode:
db_conn_hll.query("SELECT hll_set_defaults(11, 5, 0, 1);")
Optional Cleanup Step
sql_query = f"""
DROP TABLE IF EXISTS spatial.latlng_startup;
"""
cur_hll.execute(sql_query)
print(cur_hll.statusmessage)
db_connection_hll.rollback()
sql_query = f"""
CREATE TABLE spatial.latlng_startup (
latitude float,
longitude float,
PRIMARY KEY (latitude, longitude),
latlng_geom geometry(Point, 4326) NOT NULL)
INHERITS (
social.user_hll, -- e.g. number of users/latlng (=upl)
topical.post_hll, -- e.g. number of posts/latlng
topical.utl_hll -- e.g. number of terms/latlng
);
"""
cur_hll.execute(sql_query)
print(cur_hll.statusmessage)
sql_query = f"""
INSERT INTO spatial.latlng_startup(
latitude,
longitude,
user_hll,
post_hll,
latlng_geom)
SELECT latitude,
longitude,
hll_add_agg(hll_hash_text(user_guid)) as user_hll,
hll_add_agg(hll_hash_text(post_guid)) as post_hll,
ST_SetSRID(
ST_MakePoint(longitude, latitude), 4326) as latlng_geom
FROM mviews.spatiallatlng_raw_startup_de
--LIMIT 100
GROUP BY latitude, longitude;
"""
cur_hll.execute(sql_query)
print(cur_hll.statusmessage)
df = db_conn_hll.query(
f"""
SELECT latitude,
longitude,
hll_cardinality(post_hll)::int as postcount,
hll_cardinality(user_hll)::int as usercount
FROM spatial.latlng_startup
ORDER BY usercount DESC
""")
df.head()
Store changes to DB:
db_connection_hll.commit()
Interactive visualization¶
points_lonlat = gv.Points(
df,
kdims=['longitude', 'latitude'],
vdims=['postcount', 'usercount'])
Define the bounding box, for the initial zoom level of the map:
LIM_LNG_MIN = 5.174561
LIM_LNG_MAX = 21.862794
LIM_LAT_MIN = 44.465151
LIM_LAT_MAX = 49.922936
from cartopy import crs as ccrs
def set_active_tool(plot, element):
"""Enable wheel_zoom in bokeh plot by default"""
plot.state.toolbar.active_scroll = plot.state.tools[0]
# project bbox coordinates
proj_transformer_mercator = Transformer.from_crs(
"epsg:4326", "epsg:3857", always_xy=True)
bb_bottomleft = proj_transformer_mercator.transform(
LIM_LNG_MIN, LIM_LAT_MIN)
bb_topright = proj_transformer_mercator.transform(
LIM_LNG_MAX, LIM_LAT_MAX)
gv_layers = hv.Overlay(
gv.tile_sources.EsriImagery * \
points_lonlat.opts(
tools=['hover'],
size=2+dim('usercount')/30,
line_color='black',
line_width=0.1,
fill_alpha=0.8,
fill_color='yellow')
)
layer_options = {
"projection":ccrs.GOOGLE_MERCATOR,
"title":'Estimate User Count for distinct startup mentions',
"responsive":True,
# "xlim":(bbox_bottomleft[0], bbox_topright[0]),
# "ylim":(bbox_bottomleft[1], bbox_topright[1]),
"data_aspect":0.45, # maintain fixed aspect ratio during responsive resize
"hooks":[set_active_tool]
}
gv_layers.opts(**layer_options)
Store to separate standalone HTML:
layer_options["data_aspect"] = None
hv.save(
gv_layers.opts(**layer_options), OUTPUT / f'geoviews_map_startup.html', backend='bokeh')
Create notebook HTML¶
!jupyter nbconvert --to html_toc \
--output-dir=../resources/html/ ./02_grid_exploration.ipynb \
--template=../nbconvert.tpl \
--ExtractOutputPreprocessor.enabled=False >&- 2>&-