Python Performance – Solving Performance Issues with Average Pixel Values in Buffered Circles

numpyperformancepythonrasterio

I have a EPSG:4326 raster file with pixel size of 15 arcseconds, and a list of longitudes and latitudes. For each pair of coordinate, I want to create a buffered circle of X kilometers surrounding them, and aggregate over the pixel values within each circle.

Following this thread: Rasterio. Pixel value of a point X km away from current point:

  1. I have used Shapely to get a circle of defined radius from a centre point (after reprojecting the EPSG:4326 file to a projected coordinate system).
  2. Then, I use rasterio.features.geometry_mask to calculate a numpy mask given the Shapely circle.
  3. Finally, I calculate the average pixel value on the masked array with np.ma.mean

The problem is the np.ma.mean function is too slow for my numpy array (it has the shape of (34779, 45217), and dtype of float32), and it can take hours to perform the operation over my list of 5000 longtitudes and latitudes. Is there any other way? I am open to any tool, including Python, R, and any and all software.

Below is the annotated code for your reference.

from functools import partial
import pyproj
from shapely.geometry import Point
from shapely.ops import transform
from rasterio.features import geometry_mask
import rasterio
import pandas as pd
import numpy as np


sample = pd.read_csv(lat_lon_url) # read in dataframe that contains longtitude and latitude
raster = rasterio.open(raster_url)
band = raster.read(1)

''' 
apply the get_mean function for each row of the sample dataframe.
each row of the sample dataframe has a longitude and latitude. 
'''
sample['lightscore'] = sample.apply(lambda row: get_mean(row['longtitude'], row['latitude'], 
                                    raster.height, raster.width, raster.transform, 
                                    band, radius = 5000), axis = 1)                                    

Below is the get_mean function.

'''
get_mean function 
1. creates a circle with specified radius surrounding the point with longitude & latitude
2. aggregate over the circle to get average pixel value
'''

def get_mean(lon, lat,
             raster_height, raster_width, raster_transform,
             band, radius: float) -> float:

    '''
    Define the projected and geographic coordinate system
    '''
    local_azimuthal_projection = "+proj=aeqd +R=6371000 +units=m +lat_0={} +lon_0={}".format(
        lat, lon
    )
    wgs84 = "+proj=longlat +datum=WGS84 +no_defs"

    '''
    Define the projection.
    '''
    wgs84_to_aeqd = partial(
        pyproj.transform,
        pyproj.Proj(wgs84),
        pyproj.Proj(local_azimuthal_projection),
    )
    aeqd_to_wgs84 = partial(
        pyproj.transform,
        pyproj.Proj(local_azimuthal_projection),
        pyproj.Proj(wgs84),
    )


    center = Point(float(lon), float(lat))
    point_transformed = transform(wgs84_to_aeqd, center) # reproject the point from geographic to projected system
    buffer = point_transformed.buffer(radius)
    circle_poly = transform(aeqd_to_wgs84, buffer) # reproject the buffered circle from projected back to geographic system

    '''
    Calculate a numpy mask given the Shapely circle.
    '''
    circle_mask = geometry_mask(geometries = [circle_poly], 
                                out_shape = (raster_height, raster_width),
                                transform = raster_transform) # this takes a while for each record of my sample dataframe,
                                                              # but not as long as the np.ma.mean below               

    '''
    Create masked numpy array and aggregate over it
    '''                                            
    masked_data = np.ma.array(band, mask=circle_mask)
    mean = masked_data.mean() # this takes EXTREMELY LONG for each record of my sample dataframe.

    return mean

Best Answer

PyQGIS manages to perform this task significantly faster than numpy or rasterstats. Here is the link to the cookbook: https://docs.qgis.org/3.16/en/docs/pyqgis_developer_cookbook/index.html.

To complete the task:

... a EPSG:4326 raster file with pixel size of 15 arcseconds, and a list of longitudes and latitudes. For each pair of coordinate, ... create a buffered circle of X kilometers surrounding them, and aggregate over the pixel values within each circle.

You can use the below template:

from qgis.core import *
from qgis.PyQt.QtCore import QVariant
from qgis.analysis import QgsZonalStatistics

import pandas as pd

# Inits app
QgsApplication.setPrefixPath(path_to_qgis, True)
qgs = QgsApplication([], False)
qgs.initQgis()

# Sets up original CRS
epsg4326 = QgsCoordinateReferenceSystem("EPSG:4326")
transformContext = QgsProject.instance().transformContext()

# Creates shapefile to write buffer into
filename = "buffers.shp"

fields = QgsFields()
fields.append(QgsField(lon_col_name, QVariant.Double))
fields.append(QgsField(lat_col_name, QVariant.Double))

save_options = QgsVectorFileWriter.SaveVectorOptions()
save_options.driverName = "ESRI Shapefile"
save_options.fileEncoding = "UTF-8"

writer = QgsVectorFileWriter.create(
    filename, 
    fields,
    QgsWkbTypes.Polygon,
    epsg4326,
    transformContext,
    save_options
)

# Loads .csv file as vector layer
data_url = "link/to/csv/file/contains/lat/lon"
vlayer = QgsVectorLayer(data_url, "coordinates", "ogr")
features = vlayer.getFeatures()

field_names = [ field.name() for field in vlayer.fields() ]
lat_col_idx = field_names.index("lat")
lon_col_idx = field_names.index("lon")

for f in features: # each feature is a point
    attributes = f.attributes()
    lat = float(attributes[lat_col_idx])
    lon = float(attributes[lon_col_idx])

    # Sets up projected CRS and transformer
    local_azimuthal = QgsCoordinateReferenceSystem()
    proj4_str = "+proj=aeqd +R=6371000 +units=m +lat_0={} +lon_0={}".format(lat, lon)
    local_azimuthal.createFromProj(proj4_str)

    wgs84_to_azimuthal = QgsCoordinateTransform(epsg4326, local_azimuthal, transformContext)
    azimuthal_to_wgs84 = QgsCoordinateTransform(local_azimuthal, epsg4326, transformContext)

    # Projects the point, creates buffer, and reprojects to original CRS
    projected_point = wgs84_to_azimuthal.transform(QgsPointXY(lon, lat))
    point_geom = QgsGeometry.fromPointXY(projected_point)
    radius = 5000 # in meters
    segment = 18
    buffer = point_geom.buffer(radius, segment)
    buffer.transform(azimuthal_to_wgs84)

    # Writes buffer to file
    fet = QgsFeature()
    fet.setGeometry(buffer)
    fet.setAttributes([lon, lat])
    writer.addFeature(fet)

# flush file to avoid memory leak
del writer

# Reads in the buffer shape files
filename = "buffers.shp"
vlayer = QgsVectorLayer(filename, "buffers", "ogr")

# Calculates zonal mean-s for each raster
raster_url = "link/to/raster"
raster = QgsRasterLayer(raster_url)
zoneStats = QgsZonalStatistics(vlayer, raster, stats = QgsZonalStatistics.Statistics(QgsZonalStatistics.Mean))
zoneStats.calculateStatistics(None)

# Writes zonal mean-s to csv
destUrl = "link/to/destination/csv"
options = QgsVectorFileWriter.SaveVectorOptions()
options.driverName = "CSV"
QgsVectorFileWriter.writeAsVectorFormatV2(vlayer, destUrl, transformContext, options)

del vlayer

Thank you @StefanBrand for suggesting QGIS.