!pip install sodapy

Requirement already satisfied: sodapy in c:\users\seism\miniconda3\envs\earth-analytics-python\lib\site-packages (2.2.0)
Requirement already satisfied: requests>=2.28.1 in c:\users\seism\miniconda3\envs\earth-analytics-python\lib\site-packages (from sodapy) (2.32.5)
Requirement already satisfied: charset_normalizer<4,>=2 in c:\users\seism\miniconda3\envs\earth-analytics-python\lib\site-packages (from requests>=2.28.1->sodapy) (3.4.3)
Requirement already satisfied: idna<4,>=2.5 in c:\users\seism\miniconda3\envs\earth-analytics-python\lib\site-packages (from requests>=2.28.1->sodapy) (3.10)
Requirement already satisfied: urllib3<3,>=1.21.1 in c:\users\seism\miniconda3\envs\earth-analytics-python\lib\site-packages (from requests>=2.28.1->sodapy) (2.5.0)
Requirement already satisfied: certifi>=2017.4.17 in c:\users\seism\miniconda3\envs\earth-analytics-python\lib\site-packages (from requests>=2.28.1->sodapy) (2025.8.3)

### Import libraries

### file paths and organization
import os
import pathlib
from pathlib import Path

### see warnings
import warnings

### deal with different data types
import geopandas as gpd
import numpy as np
import pandas as pd
import xarray as xr
import rioxarray as rxr
import rioxarray.merge as rxrmerge
import shapely
from shapely.geometry import box
import time

### plotting and visualization
import geoviews as gv
import holoviews as hv
import hvplot.pandas
import hvplot.xarray
from cartopy import crs as ccrs

### data exploration and visualiation
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats

### image processing
from scipy.ndimage import convolve
from scipy.ndimage import label

### working with data in the cloud
import pystac_client
from sodapy import Socrata

### progress bar
from tqdm.notebook import tqdm

### OLS sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

import json

### Create reproducible file paths
data_dir = os.path.join(

    ### home directory
    pathlib.Path.home(),

    ### Earth analytics folder
    'earth-analytics',
    'data',

    ## project directory
    'urban-green-big-data2',
)

### make the dir
os.makedirs(data_dir, exist_ok = True)
print(data_dir)

C:\Users\seism\earth-analytics\data\urban-green-big-data2

### revent GDAL from quitting due to momentary disruptions
os.environ["GDAL_HTTP_MAX_RETRY"] = "5"
os.environ["GDAL_HTTP_RETRY_DELAY"] = "1"

### Sanity check
# tract_gdf.columns

### You were not kidding when you said it was difficult to get Portland Oregon
# tract_gdf["ST"].value_counts().head(20)
# tract_gdf.loc[tract_gdf["PlaceName"].astype(str).str.contains("Portland", na=False), ["PlaceName","ST"]].drop_duplicates()

### Set up the census tract path

### define directory
tract_dir = os.path.join(data_dir, 'portland-tract')

### make the directory
os.makedirs(tract_dir, exist_ok = True)

### make path to directory
tract_path = os.path.join(tract_dir, 'portland-tract.shp')

### Download the census tracts (only once)
if not os.path.exists(tract_path):

    ### put in census url
    tract_url = ('https://data.cdc.gov/download/x7zy-2xmx/application%2Fzip')

    ### read the shp file
    tract_gdf = gpd.read_file(tract_url)

    ### subset to Portland
    por_tract_gdf = tract_gdf[
    (tract_gdf["PlaceName"] == "Portland") &
    (tract_gdf["ST"].astype(str).str.zfill(2) == "41")
    ].copy()

    ### save it as a file
    por_tract_gdf.to_file(tract_path, index = False)

por_tract_gdf.shape
por_tract_gdf[["PlaceName","ST"]].drop_duplicates()
por_tract_gdf.geometry.isna().sum(), len(por_tract_gdf)

(np.int64(0), 161)

### Load in census tract data
por_tract_gdf = gpd.read_file(tract_path)
por_tract_gdf


### Site plot -- Census tracts with satellite imagery in the background
(
    por_tract_gdf

    ### set projection
    .to_crs(ccrs.Mercator())

    ### plot with satellite imagery in the background
    .hvplot(
        line_color = 'orange', fill_color = None,
        crs = ccrs.Mercator(), tiles = 'EsriImagery',
        frame_width = 600
    )
)

### debugging
# tmp = pd.read_csv(cdc_url)
# tmp.columns

### Set up a path for the asthma data
cdc_path = os.path.join(data_dir, 'asthma.csv')

### Download asthma data (only once)
if not os.path.exists(cdc_path):

    cdc_url = (

        ### url for data
        "https://data.cdc.gov/resource/cwsq-ngmh.csv"

        ### parameter to filter on
        "?year=2023"
        "&stateabbr=OR"
        "&countyname=Multnomah"
        "&measureid=CASTHMA"
        "&$limit=1500"
    )

    cdc_df = (

        ### open it as a csv
        pd.read_csv(cdc_url)

        ### rename the cols
        .rename(columns = {
            'data_value': 'asthma',
            'low_confidence_limit' : 'asthma_ci_low',
            'high_confidence_limit': 'asthma_ci_high',
            'locationid': 'tract'
        })


        ### select columns I want to keep
        [[
            'year', 'tract', 
            'asthma', 'asthma_ci_low', 'asthma_ci_high',
            'data_value_unit', 'totalpopulation',
            'totalpop18plus'
        ]]
    )

    cdc_df['tract'] = cdc_df['tract'].astype(str)
    
    ### download to a csv
    cdc_df.to_csv(cdc_path, index = False)


### Load in asthma data
cdc_df = pd.read_csv(cdc_path, dtype={"tract": str})

### Preview asthma data
cdc_df

### Sanity check
cdc_df["tract"].head()
cdc_df["tract"].str.len().value_counts()

tract
11    197
Name: count, dtype: int64

### Sanity check
por_tract_gdf["tract2010"].astype(str).str.len().value_counts()

tract2010
11    161
Name: count, dtype: int64

### Change tract identifier datatype for merging
por_tract_gdf["tract2010"] = por_tract_gdf["tract2010"].astype(str)
cdc_df["tract"] = cdc_df["tract"].astype(str)

### merge census data with geometry
tract_cdc_gdf = (
    por_tract_gdf
    .merge(cdc_df,
           
           ### specify cols to merge on
           left_on = 'tract2010',
           right_on = 'tract',

           ### use inner join
           how = 'inner'
           )
)
# tract_cdc_gdf
### Plot asthma data as chloropleth
(
    ### load basemap with satellite imagery
    gv.tile_sources.EsriImagery

    *

    ### map census tracts with asthma data
    gv.Polygons(

        ### reproject to align CRSs
        tract_cdc_gdf.to_crs(ccrs.Mercator()),

        ### select cols
        vdims = ['asthma', 'tract2010'],

        ### ensure CRSs align
        crs = ccrs.Mercator()
    ).opts(color = 'asthma', colorbar = True, tools = ['hover'], colorbar_opts={
            'title': 'Asthma prevalence (%)'})
).opts(width = 600, height = 600, xaxis = None, yaxis = None, title='Asthma Prevalence by Census Tract (Portland)')

### generate for the portfolio
asthma_map = (
    gv.tile_sources.EsriImagery
    *
    gv.Polygons(
        tract_cdc_gdf.to_crs(ccrs.Mercator()),
        vdims=['asthma', 'tract2010'],
        crs=ccrs.Mercator()
    ).opts(
        color='asthma',
        colorbar=True,
        tools=['hover'],
        colorbar_opts={'title': 'Asthma prevalence (%)'}
    )
).opts(
    width=500, height=500,
    xaxis=None, yaxis=None,
    title='Asthma Prevalence by Census Tract (Portland)'
)

asthma_map

hv.save(asthma_map, "portland_asthma_map.html")

### Connect to the planetary computer catalog
e84_catalog = pystac_client.Client.open(
    "https://planetarycomputer.microsoft.com/api/stac/v1"
)

### Convert geometry to lat/lon for STAC
tract_latlong_gdf = tract_cdc_gdf.to_crs(4326)

### Define a path to save NDVI stats
ndvi_stats_path = os.path.join(data_dir, 'portland-ndvi-stats.csv')

### Check for existing data - do not access duplicate tracts

### initialize an empty list for the census tract IDs
downloaded_tracts = []

### write the conditional
if os.path.exists(ndvi_stats_path):

    ### if it exists, open the csvs in the path
    ndvi_stats_df = pd.read_csv(ndvi_stats_path)

    ### fill the list with the tract values
    downloaded_tracts = ndvi_stats_df.tract.values

### if it doesn't exist, print statement
else:
    print('No census tracts downloaded so far')

No census tracts downloaded so far

print("tract_latlong_gdf rows:", len(tract_latlong_gdf))
print("geometry missing:", tract_latlong_gdf.geometry.isna().sum())
print("CRS:", tract_latlong_gdf.crs)

tract_latlong_gdf rows: 125
geometry missing: 0
CRS: EPSG:4326

print("ndvi_stats_path exists:", os.path.exists(ndvi_stats_path))
print("downloaded_tracts count:", len(downloaded_tracts))
print("downloaded_tracts sample:", downloaded_tracts[:5] if len(downloaded_tracts) else [])

ndvi_stats_path exists: False
downloaded_tracts count: 0
downloaded_tracts sample: []

### run 1 iteration
i = 0
tract_values = tract_latlong_gdf.iloc[i]
tract = tract_values.tract2010
tract

'41051000301'

### check if stats already downloaded for this tract
if not (tract in downloaded_tracts):
    print('proceed with the STAC search')

proceed with the STAC search

### define naip_search
naip_search = e84_catalog.search(

    ### we want naip data
    collections = ['naip'],

    ### 2021 naip imagery
    intersects = shapely.to_geojson(tract_values.geometry), 
    datetime = '2021'
)

### Sanity check
items = list(naip_search.items())
len(items)

0

### Concerned that NAIP acquisition for Oregon might have been skipped in 2021, does NAIP return anything if I remove the year?

geom = tract_latlong_gdf.geometry.iloc[0]
geom_geojson = json.loads(shapely.to_geojson(geom))  # <-- dict, not a string

search = e84_catalog.search(
    collections=["naip"],
    intersects=geom_geojson,
    # datetime omitted
    limit=5
)

items = list(search.items())
print("items:", len(items))
if items:
    print("example datetime:", items[0].datetime)
    print("example id:", items[0].id)

items: 12
example datetime: 2022-07-11 16:00:00+00:00
example id: or_m_4512236_nw_10_030_20220711

### Naip_search
list(naip_search.items())[:2]

### build a dataframe
scene_df = pd.DataFrame(dict(

    ### tract column
    tract = tract,

    ### date column
    data = [pd.to_datetime(scene.datetime).date()
            
            ### return the scenes (STAC items)
            for scene in naip_search.items()], 

    ### make rgbir_href
    rgbir_href = [scene.assets['image'].href for scene in naip_search.items()],

))

scene_df

### loop through all the tracts

### Convert geometry to lat/lon for STAC
tract_latlong_gdf = tract_cdc_gdf.to_crs(4326)

### Define a path to save NDVI stats
ndvi_stats_path = os.path.join(data_dir, 'portland-ndvi-stats.csv')

### Check for existing data - do not access duplicate tracts
### initialize an empty list for the census tract IDs
downloaded_tracts = []

### write the conditional
if os.path.exists(ndvi_stats_path):

    ### if it exists, open the csvs in the path
    ndvi_stats_df = pd.read_csv(ndvi_stats_path)

    ### fill the list with the tract values
    downloaded_tracts = ndvi_stats_df.tract.values

### if it doesn't exist, print statement
else:
    print('No census tracts downloaded so far')


### Loop through each census tract
### list to accumulate into
scene_dfs = []

print("tracts to process:", len(tract_latlong_gdf))
print("already downloaded:", len(downloaded_tracts))
print("sample tracts:", tract_latlong_gdf["tract2010"].head().tolist())

### loop through each tract value in the gdf
for i, tract_values in tqdm(tract_latlong_gdf.iterrows()):

    ### for i, extract the tract id
    tract = tract_values.tract2010

    ### Check if statistics are already downloaded for this tract
    if not (tract in downloaded_tracts):

        ### Repeat up to 5 times in case of a momentary disruption 
        i = 0 ### initialize retry counter
        retry_limit = 5 ### max number of tries
        while i < retry_limit:

            ### Try accessing the STAC
            try:

                ### Search for tiles 
                naip_search = e84_catalog.search(
                    collections = ['naip'],
                    intersects = shapely.to_geojson(tract_values.geometry),
                    datetime="2022"
                )

                ### Build dataframe with tracts and tile urls
                scene_dfs.append(pd.DataFrame(dict(

                    ### column called tract
                    tract = tract,
                    date = [pd.to_datetime(scene.datetime).date()
                            
                            for scene in naip_search.items()],

                    rgbir_href = [scene.assets['image'].href for scene in naip_search.items()],
                    
                )))

                ### exit retry loop open STAC request succeeds
                break

            ### Try again in case of an APIError
            except pystac_client.exceptions.APIError:
                print(
                    f'Could not connect with STAC server'
                    f'Retrying tract {tract}...')

                ### wait 2 seconds before retrying
                time.sleep(2)
                i += 1
                continue        

### Concatenate the url dataframes
if scene_dfs:

    ### concatenate
    scene_df = pd.concat(scene_dfs).reset_index(drop = True)

### otherwise, tell me it didn't work
else:
    scene_df = None

### Preview the url dataframe
scene_df

No census tracts downloaded so far
tracts to process: 125
already downloaded: 0
sample tracts: ['41051000301', '41051000302', '41051000401', '41051000402', '41051000501']

0it [00:00, ?it/s]

# write out scene_df as a csv (only if we actually built one)
if scene_df is not None and not scene_df.empty:
    scene_df.to_csv(ndvi_stats_path, index=False)
else:
    print("No new tracts to write (scene_df is empty/None).")

### Sanity check
print("file exists:", os.path.exists(ndvi_stats_path))
if os.path.exists(ndvi_stats_path):
    import os
    print("file size (bytes):", os.path.getsize(ndvi_stats_path))

file exists: True
file size (bytes): 33343

### open df of NAIP imagery URLs
scene_df = pd.read_csv(ndvi_stats_path)
scene_df

### Pull out first tract value
# tract = por_tract_gdf.loc[0].tract2010
# tract
### use a tract that we know has imagery
tract = scene_df["tract"].astype(str).iloc[0]
tract

### convert tract to string
tract = str(tract)
tract

'41051000301'

### Sanity check
print("tract value:", tract)
print("tract type:", type(tract))
print("scene_df tract dtype:", scene_df["tract"].dtype)
print("scene_df tract sample:", scene_df["tract"].head().tolist())

tract value: 41051000301
tract type: <class 'str'>
scene_df tract dtype: int64
scene_df tract sample: [41051000301, 41051000301, 41051000302, 41051000302, 41051000401]

### grab first row of scene_df dataframe, select image column for matching rows
href_value = scene_df.loc[
    scene_df.tract.astype(str) == tract,

    ### column to extract
    "rgbir_href"
].iloc[0]

href_value

'https://naipeuwest.blob.core.windows.net/naip/v002/or/2022/or_030cm_2022/45122/m_4512236_nw_10_030_20220711.tif'

### process one scene
### open the raster
tile_da = rxr.open_rasterio(

    ### give it the url we want
    href_value,

    ### mask out NoData pixels
    masked = True

### remove dimensions of length = 1
).squeeze()

tile_da

<xarray.DataArray (band: 4, y: 24560, x: 17740)> Size: 7GB
[1742777600 values with dtype=float32]
Coordinates:
  * band         (band) int64 32B 1 2 3 4
  * x            (x) float64 142kB 5.291e+05 5.291e+05 ... 5.344e+05 5.344e+05
  * y            (y) float64 196kB 5.039e+06 5.039e+06 ... 5.031e+06 5.031e+06
    spatial_ref  int64 8B 0
Attributes:
    TIFFTAG_XRESOLUTION:     1
    TIFFTAG_YRESOLUTION:     1
    TIFFTAG_RESOLUTIONUNIT:  1 (unitless)
    AREA_OR_POINT:           Area
    scale_factor:            1.0
    add_offset:              0.0

[1742777600 values with dtype=float32]

array([1, 2, 3, 4])

array([529098.15, 529098.45, 529098.75, ..., 534419.25, 534419.55, 534419.85],
      shape=(17740,))

array([5038787.85, 5038787.55, 5038787.25, ..., 5031420.75, 5031420.45,
       5031420.15], shape=(24560,))

array(0)

PandasIndex(Index([1, 2, 3, 4], dtype='int64', name='band'))

PandasIndex(Index([        529098.15, 529098.4500000001,         529098.75,
               529099.05,         529099.35,         529099.65,
       529099.9500000001,         529100.25,         529100.55,
               529100.85,
       ...
               534417.15, 534417.4500000001,         534417.75,
               534418.05,         534418.35,         534418.65,
       534418.9500000001,         534419.25,         534419.55,
               534419.85],
      dtype='float64', name='x', length=17740))

PandasIndex(Index([       5038787.85,        5038787.55,        5038787.25,
       5038786.949999999, 5038786.649999999,        5038786.35,
              5038786.05,        5038785.75, 5038785.449999999,
       5038785.149999999,
       ...
              5031422.85,        5031422.55,        5031422.25,
       5031421.949999999, 5031421.649999999,        5031421.35,
              5031421.05,        5031420.75, 5031420.449999999,
       5031420.149999999],
      dtype='float64', name='y', length=24560))

### check out dimensions
tile_da.dims

('band', 'y', 'x')

print(scene_df.columns)

Index(['tract', 'date', 'rgbir_href'], dtype='object')

tract_cdc_gdf

tract_cdc_gdf["tract2010"].head()

0    41051000301
1    41051000302
2    41051000401
3    41051000402
4    41051000501
Name: tract2010, dtype: object

### Sanity check
tract in tract_cdc_gdf["tract2010"].astype(str).values

True

### get boundary of the tract we're working with
boundary = (
    tract_cdc_gdf

    ### deal with issues with integers
    .assign(tract2010 = lambda df: df["tract2010"].astype(str))

    ### use tract ID as the index
    .set_index("tract2010")

    ### grab the tract we are using
    .loc[[str(tract)]]

    ### make sure the CRS matc
    .to_crs(tile_da.rio.crs)

    ### grab geometry
    .geometry
)

boundary

tract2010
41051000301    POLYGON ((529476.134 5037488.683, 529554.509 5...
Name: geometry, dtype: geometry

### crop raster to the bounding box
crop_da = tile_da.rio.clip_box(

    ### get coordinates of the bounding box
    *boundary.envelope.total_bounds,

    ### let it expand a little
    auto_expand = True
)

crop_da

<xarray.DataArray (band: 4, y: 6054, x: 2115)> Size: 205MB
[51216840 values with dtype=float32]
Coordinates:
  * band         (band) int64 32B 1 2 3 4
  * x            (x) float64 17kB 5.291e+05 5.291e+05 ... 5.297e+05 5.297e+05
  * y            (y) float64 48kB 5.038e+06 5.038e+06 ... 5.036e+06 5.036e+06
    spatial_ref  int64 8B 0
Attributes:
    TIFFTAG_XRESOLUTION:     1
    TIFFTAG_YRESOLUTION:     1
    TIFFTAG_RESOLUTIONUNIT:  1 (unitless)
    AREA_OR_POINT:           Area
    scale_factor:            1.0
    add_offset:              0.0

[51216840 values with dtype=float32]

array([1, 2, 3, 4])

array([529098.15, 529098.45, 529098.75, ..., 529731.75, 529732.05, 529732.35],
      shape=(2115,))

array([5037519.75, 5037519.45, 5037519.15, ..., 5035704.45, 5035704.15,
       5035703.85], shape=(6054,))

array(0)

PandasIndex(Index([1, 2, 3, 4], dtype='int64', name='band'))

PandasIndex(Index([        529098.15, 529098.4500000001,         529098.75,
               529099.05,         529099.35,         529099.65,
       529099.9500000001,         529100.25,         529100.55,
               529100.85,
       ...
               529729.65, 529729.9500000001,         529730.25,
               529730.55,         529730.85,         529731.15,
       529731.4500000001,         529731.75,         529732.05,
               529732.35],
      dtype='float64', name='x', length=2115))

PandasIndex(Index([       5037519.75, 5037519.449999999, 5037519.149999999,
              5037518.85,        5037518.55,        5037518.25,
       5037517.949999999, 5037517.649999999,        5037517.35,
              5037517.05,
       ...
              5035706.55,        5035706.25, 5035705.949999999,
       5035705.649999999,        5035705.35,        5035705.05,
              5035704.75, 5035704.449999999, 5035704.149999999,
              5035703.85],
      dtype='float64', name='y', length=6054))

### clip to exact geometry of the census tract
clip_da = crop_da.rio.clip(

    boundary,

    ### include all pixels touching the boundary
    all_touched = True)

clip_da

<xarray.DataArray (band: 4, y: 4117, x: 2115)> Size: 139MB
array([[[ nan,  nan,  nan, ..., 200., 200., 201.],
        [ nan,  nan,  nan, ..., 192., 189., 189.],
        [ nan,  nan,  nan, ..., 156., 157.,  nan],
        ...,
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan]],

       [[ nan,  nan,  nan, ..., 204., 207., 207.],
        [ nan,  nan,  nan, ..., 194., 197., 194.],
        [ nan,  nan,  nan, ..., 154., 154.,  nan],
        ...,
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan]],

       [[ nan,  nan,  nan, ..., 195., 199., 199.],
        [ nan,  nan,  nan, ..., 185., 188., 186.],
        [ nan,  nan,  nan, ..., 146., 147.,  nan],
        ...,
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan]],

       [[ nan,  nan,  nan, ..., 174., 180., 180.],
        [ nan,  nan,  nan, ..., 164., 169., 167.],
        [ nan,  nan,  nan, ..., 132., 133.,  nan],
        ...,
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan]]],
      shape=(4, 4117, 2115), dtype=float32)
Coordinates:
  * band         (band) int64 32B 1 2 3 4
  * x            (x) float64 17kB 5.291e+05 5.291e+05 ... 5.297e+05 5.297e+05
  * y            (y) float64 33kB 5.037e+06 5.037e+06 ... 5.036e+06 5.036e+06
    spatial_ref  int64 8B 0
Attributes:
    TIFFTAG_XRESOLUTION:     1
    TIFFTAG_YRESOLUTION:     1
    TIFFTAG_RESOLUTIONUNIT:  1 (unitless)
    AREA_OR_POINT:           Area
    scale_factor:            1.0
    add_offset:              0.0

array([[[ nan,  nan,  nan, ..., 200., 200., 201.],
        [ nan,  nan,  nan, ..., 192., 189., 189.],
        [ nan,  nan,  nan, ..., 156., 157.,  nan],
        ...,
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan]],

       [[ nan,  nan,  nan, ..., 204., 207., 207.],
        [ nan,  nan,  nan, ..., 194., 197., 194.],
        [ nan,  nan,  nan, ..., 154., 154.,  nan],
        ...,
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan]],

       [[ nan,  nan,  nan, ..., 195., 199., 199.],
        [ nan,  nan,  nan, ..., 185., 188., 186.],
        [ nan,  nan,  nan, ..., 146., 147.,  nan],
        ...,
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan]],

       [[ nan,  nan,  nan, ..., 174., 180., 180.],
        [ nan,  nan,  nan, ..., 164., 169., 167.],
        [ nan,  nan,  nan, ..., 132., 133.,  nan],
        ...,
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan],
        [ nan,  nan,  nan, ...,  nan,  nan,  nan]]],
      shape=(4, 4117, 2115), dtype=float32)

array([1, 2, 3, 4])

array([529098.15, 529098.45, 529098.75, ..., 529731.75, 529732.05, 529732.35],
      shape=(2115,))

array([5037490.05, 5037489.75, 5037489.45, ..., 5036255.85, 5036255.55,
       5036255.25], shape=(4117,))

array(0)

PandasIndex(Index([1, 2, 3, 4], dtype='int64', name='band'))

PandasIndex(Index([        529098.15, 529098.4500000001,         529098.75,
               529099.05,         529099.35,         529099.65,
       529099.9500000001,         529100.25,         529100.55,
               529100.85,
       ...
               529729.65, 529729.9500000001,         529730.25,
               529730.55,         529730.85,         529731.15,
       529731.4500000001,         529731.75,         529732.05,
               529732.35],
      dtype='float64', name='x', length=2115))

PandasIndex(Index([       5037490.05,        5037489.75, 5037489.449999999,
       5037489.149999999,        5037488.85,        5037488.55,
              5037488.25, 5037487.949999999, 5037487.649999999,
              5037487.35,
       ...
       5036257.949999999, 5036257.649999999,        5036257.35,
              5036257.05,        5036256.75, 5036256.449999999,
       5036256.149999999,        5036255.85,        5036255.55,
              5036255.25],
      dtype='float64', name='y', length=4117))

clip_da.shape

(4, 4117, 2115)

### do NDVI math
ndvi_da = (
    (clip_da.sel(band = 4) - clip_da.sel(band = 1))
    / (clip_da.sel(band = 4) + clip_da.sel(band = 1))
)

ndvi_da

<xarray.DataArray (y: 4117, x: 2115)> Size: 35MB
array([[        nan,         nan,         nan, ..., -0.06951872,
        -0.05263158, -0.05511811],
       [        nan,         nan,         nan, ..., -0.07865169,
        -0.05586592, -0.06179775],
       [        nan,         nan,         nan, ..., -0.08333334,
        -0.08275862,         nan],
       ...,
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan]], shape=(4117, 2115), dtype=float32)
Coordinates:
  * x            (x) float64 17kB 5.291e+05 5.291e+05 ... 5.297e+05 5.297e+05
  * y            (y) float64 33kB 5.037e+06 5.037e+06 ... 5.036e+06 5.036e+06
    spatial_ref  int64 8B 0

array([[        nan,         nan,         nan, ..., -0.06951872,
        -0.05263158, -0.05511811],
       [        nan,         nan,         nan, ..., -0.07865169,
        -0.05586592, -0.06179775],
       [        nan,         nan,         nan, ..., -0.08333334,
        -0.08275862,         nan],
       ...,
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan],
       [        nan,         nan,         nan, ...,         nan,
                nan,         nan]], shape=(4117, 2115), dtype=float32)

array([529098.15, 529098.45, 529098.75, ..., 529731.75, 529732.05, 529732.35],
      shape=(2115,))

array([5037490.05, 5037489.75, 5037489.45, ..., 5036255.85, 5036255.55,
       5036255.25], shape=(4117,))

array(0)

PandasIndex(Index([        529098.15, 529098.4500000001,         529098.75,
               529099.05,         529099.35,         529099.65,
       529099.9500000001,         529100.25,         529100.55,
               529100.85,
       ...
               529729.65, 529729.9500000001,         529730.25,
               529730.55,         529730.85,         529731.15,
       529731.4500000001,         529731.75,         529732.05,
               529732.35],
      dtype='float64', name='x', length=2115))

PandasIndex(Index([       5037490.05,        5037489.75, 5037489.449999999,
       5037489.149999999,        5037488.85,        5037488.55,
              5037488.25, 5037487.949999999, 5037487.649999999,
              5037487.35,
       ...
       5036257.949999999, 5036257.649999999,        5036257.35,
              5036257.05,        5036256.75, 5036256.449999999,
       5036256.149999999,        5036255.85,        5036255.55,
              5036255.25],
      dtype='float64', name='y', length=4117))

### Skip this step if data are already downloaded 
if not scene_df is None:

    ### Get an example 
    ### pull out te identifier for an example tract
    # tract = por_tract_gdf.loc[0].tract2010
    scene_df["tract"] = scene_df["tract"].astype(str)
    tract = scene_df["tract"].iloc[0]

    ### subset scene_df to only include that tract
    # ex_scene_gdf = scene_df[scene_df.tract == tract]
    ex_scene_gdf = scene_df[scene_df["tract"] == tract]

    ### make a list to accumulate into
    tile_das = []

    ### loop through all the images for this tract
    for _, href_s in ex_scene_gdf.iterrows():

        ### open vsi connection to data
        tile_da = rxr.open_rasterio(

            ### location of the raster
            href_s.rgbir_href,

            ### deal with no data, remove extra dimension
            masked = True).squeeze()

        ### Crop data to the bounding box of the census tract
        ### make the bounding box
        boundary = (
            tract_cdc_gdf

            ### use tract id as index
            # .set_index('tract2010')
            .assign(tract2010=lambda df: df["tract2010"].astype(str))
            .set_index("tract2010")

            ### select geometry for the tract
            .loc[[tract]]

            ### reproject
            .to_crs(tile_da.rio.crs)

            ### extract geometry
            .geometry
        )

        ### crop to bounding box
        crop_da = tile_da.rio.clip_box(
 
            ### get coordinates for the bounding box
            *boundary.envelope.total_bounds,

            ### let expand
            auto_expand = True
        )

        ### Clip data to the boundary of the census tract
        clip_da = crop_da.rio.clip(
            boundary, all_touched = True)

        ### Compute NDVI
        ndvi_da = (
            (clip_da.sel(band = 4) - clip_da.sel(band = 1))
            / (clip_da.sel(band = 4) + clip_da.sel(band = 1))
        )

        ### Accumulate result
        tile_das.append(ndvi_da)

### check
print(f"Number of tiles in tiles_das: {len(tile_das)}")

### Merge data
scene_da = rxrmerge.merge_arrays(tile_das)
scene_da

### Mask vegetation
veg_mask = (scene_da > .3)


### Calculate mean patch size
labeled_patches, num_patches = label(veg_mask)
patch_sizes = np.bincount(labeled_patches.ravel())[1:]
mean_patch_size = patch_sizes.mean()

patch_sizes

Number of tiles in tiles_das: 2

array([10995,  3595,     1, ...,     4,     1,     1], shape=(65701,))

### Sanity check
print("ex_scene_gdf rows:", len(ex_scene_gdf))
print("tiles created:", len(tile_das))

ex_scene_gdf rows: 2
tiles created: 2

mean_patch_size

np.float64(130.32430252203162)

### Make kernel to calculate edge density
kernel = np.array([
    [1, 1, 1], 
    [1, -8, 1], 
    [1, 1, 1]])

### Calculate edge density
edges = convolve(veg_mask, kernel, mode='constant')
edge_density = np.sum(edges != 0) / veg_mask.size

edge_density

np.float64(0.1273147138822625)

### make csv for ndvi data
ndvi_stats_path_veg = os.path.join(data_dir, 'portland-ndvi-stats-veg-test.csv')

### debug
print("crop shape:", crop_da.shape)
print("clip shape:", clip_da.shape)

crop shape: (4, 6054, 5586)
clip shape: (4, 6054, 5586)

### debug
area_m2 = (
    tract_cdc_gdf
    .assign(tract2010=lambda d: d["tract2010"].astype(str))
    .set_index("tract2010")
    .loc[[str(tract)]]
    .to_crs(3857)
    .geometry.area
)
print("tract area m^2:", float(area_m2))

tract area m^2: 4736442.152396669

C:\Users\seism\AppData\Local\Temp\ipykernel_99948\219635894.py:10: FutureWarning: Calling float on a single element Series is deprecated and will raise a TypeError in the future. Use float(ser.iloc[0]) instead
  print("tract area m^2:", float(area_m2))

### Skip this step if data are already downloaded 
if not scene_df is None:

    scene_df["tract"] = scene_df["tract"].astype(str)
    tract_cdc_gdf["tract2010"] = tract_cdc_gdf["tract2010"].astype(str)
    
    ### Loop through the census tracts with URLs
    for tract, tract_date_gdf in tqdm(scene_df.groupby('tract')):

        ### Open all images for tract
        ### create a list to store NDVI arrays, with 1 array per NAIP image
        tile_das = []

        ### iterate over rows in tract-specific datafram
        for _, href_s in tract_date_gdf.iterrows():

            ### Open vsi connection to data
            tile_da = rxr.open_rasterio(

                ### mask and squeeze
                href_s.rgbir_href, masked = True).squeeze()
            
            ### Clip data
            ### get tract boundary
            boundary = (
                tract_cdc_gdf
                .set_index("tract2010")
                .loc[[tract]]
                .to_crs(tile_da.rio.crs)
                .geometry
            )

            ### crop to bounding box
            crop_da = tile_da.rio.clip_box(
                *boundary.envelope.total_bounds,
                auto_expand = True
            )

            ### downsample to fix out of memory error
            crop_da = crop_da.rio.reproject(crop_da.rio.crs, resolution=10)

            ### clip to actual tract geometry
            # clip_da = crop_da.rio.clip(boundary, all_touched = True)
            clip_da = crop_da.rio.clip([boundary.iloc[0]], all_touched=True)
                
            ### Compute NDVI
            # ndvi_da = ((clip_da.sel(band = 4) - clip_da.sel(band = 1))
            # / (clip_da.sel(band = 4) + clip_da.sel(band = 1)))
            ### safer, just because it keeps crashing after an hour
            nir = clip_da.sel(band=4).astype("float32")
            red = clip_da.sel(band=1).astype("float32")
            den = nir + red
            ndvi_da = (nir - red) / den.where(den != 0)

            ### Accumulate result
            tile_das.append(ndvi_da)

        ### Merge data
        scene_da = rxrmerge.merge_arrays(tile_das)

        ### Mask vegetation
        veg_mask = (scene_da > .3)

        ### Calculate statistics and save data to file
        ### count all valid pixels in the tract
        # total_pixels = scene_da.notnull().sum()
        total_pixels = int(scene_da.notnull().sum().values)

        ### count all vegetated pixels
        # veg_pixels = veg_mask.sum()
        veg_pixels = int(veg_mask.sum().values)
        
        ### identify vegetation patches
        # labeled_patches, num_patches = label(veg_mask)
        veg_arr = veg_mask.values.astype(bool)
        labeled_patches, num_patches = label(veg_arr)

        ### count patch pixels
        patch_sizes = np.bincount(labeled_patches.ravel())[1:]

        ### mean patch size
        # mean_patch_size = patch_sizes.mean()
        mean_patch_size = patch_sizes.mean() if len(patch_sizes) else np.nan

        ### Calculate edge density
        ### define the kernel
        kernel = np.array([
            [1, 1, 1], 
            [1, -8, 1], 
            [1, 1, 1]])
        
        ### detect boundaries between veg and non-veg
        # edges = convolve(veg_mask, kernel, mode='constant')
        
        ### calculate proportion of edge pixels by tract area
        # edge_density = np.sum(edges != 0) / veg_mask.size
        edges = convolve(veg_arr.astype(np.int8), kernel, mode="constant")
        edge_density = np.sum(edges != 0) / veg_arr.size

        ### Add a row to the statistics file for this tract
        ### create new datafram for the tract and append it to the csv
        pd.DataFrame(dict(

            ### store tract ID
            tract = [tract],

            ### store total pixels
            total_pixels = [int(total_pixels)],

            ### fraction of vegetated pixels 
            # frac_veg = [float(veg_pixels / total_pixels)],
            frac_veg = [float(veg_pixels / total_pixels) if total_pixels else np.nan],

            ### store mean patch size
            mean_patch_size = [mean_patch_size],

            ### edge density
            edge_density = [edge_density]

        ### write out the csv 
        )).to_csv(
            ndvi_stats_path_veg,

            ### append mode
            mode = 'a',

            ### get rid of row numbers
            index = False,

            ### write column names if the csv doesn't exist 
            header = (not os.path.exists(ndvi_stats_path_veg))
        )

### Re-load results from file

  0%|          | 0/125 [00:00<?, ?it/s]

### Understand warnings - no warnings in Portland
# ndvi_df = pd.read_csv(ndvi_stats_path_veg)
# ndvi_df[['mean_patch_size', 'frac_veg']].isna().sum()

### If needed, I am just going to park this here
### Since there is just one census tract with no vegetation patches, two choices, remove or reset from NaN, this is reset
# ndvi_df['mean_patch_size'] = ndvi_df['mean_patch_size'].fillna(0)

### check the file
ndvi_stats_df = pd.read_csv(ndvi_stats_path_veg)
ndvi_stats_df

### why do you have 1499 rows and I have 788 rows
# len(tract_cdc_gdf)
# tract_cdc_gdf['tract2010'].nunique()

### compare
# scene_df['tract'].nunique()

### Correct datatype mismatch
tract_cdc_gdf["tract2010"] = tract_cdc_gdf["tract2010"].astype(str)
ndvi_stats_df["tract"] = ndvi_stats_df["tract"].astype(str)

### Merge census data with geometry
por_ndvi_cdc_gdf = (

    ### grab census tract gdf
    tract_cdc_gdf

    ### merge with ndvi df
    .merge(
        ndvi_stats_df,

        ### match on the tract ID
        ### left:  tract2010
        ### right:  tract
        left_on = 'tract2010',
        right_on = 'tract',
        how = 'inner'
    )
)

### check
por_ndvi_cdc_gdf

### Plot chloropleths with vegetation statistics

### make a plot_cloropleth function
def plot_chloropleth(gdf, title, **opts):

    ### docstring to describe the function
    """Generate a cloropleth with the given color column"""

    ### use geoviews to make a polygon object
    return gv.Polygons(

        ### use mercator
        gdf.to_crs(ccrs.Mercator()),

        ### set plot crs to mercator
        crs = ccrs.Mercator()

    ### drop x and y axis and add a legend
    ).opts(xaxis = None, yaxis = None, colorbar = True, title=title, **opts)

### apply the function to Portland data
(
    plot_chloropleth(

        ### plot asthma by census tract
        por_ndvi_cdc_gdf, color = 'asthma', cmap = 'viridis', title='Asthma Prevalence', colorbar_opts={
            'title': 'Asthma prevalence (%)'
        })
    +

    ### add a second plot with vegetation edge density
    plot_chloropleth(
        por_ndvi_cdc_gdf, color = 'edge_density', cmap = 'Greens', title='Vegetation Edge Density', colorbar_opts={
            'title': 'Vegetation edge density'
        })
    )

### generate figure for portfolio
combo_map = (
    plot_chloropleth(
        por_ndvi_cdc_gdf,
        color="asthma",
        cmap="viridis",
        title="Asthma Prevalence",
        colorbar_opts={"title": "Asthma prevalence (%)"},
    )
    +
    plot_chloropleth(
        por_ndvi_cdc_gdf,
        color="edge_density",
        cmap="Greens",
        title="Vegetation Edge Density",
        colorbar_opts={"title": "Vegetation edge density"},
    )
).opts(width=500, height=250)

combo_map

hv.save(combo_map, "portland_asthma_edge_density.html")

### missing values
por_ndvi_cdc_gdf.isna().any()

place2010          False
tract2010          False
ST                 False
PlaceName          False
plctract10         False
PlcTrPop10         False
geometry           False
year               False
tract_x            False
asthma             False
asthma_ci_low      False
asthma_ci_high     False
data_value_unit    False
totalpopulation    False
totalpop18plus     False
tract_y            False
total_pixels       False
frac_veg           False
mean_patch_size    False
edge_density       False
dtype: bool

### Visualize distribution of data
### select variables
variables = ['frac_veg', 'asthma', 'mean_patch_size', 'edge_density']

### scatterplot
hvplot.scatter_matrix(

    por_ndvi_cdc_gdf
    [variables]
)

### histogram
### make facet
fig, axes = plt.subplots(2, 2, figsize = (12, 10))

### list veriables to plot
variables = ['frac_veg', 'asthma', 'mean_patch_size', 'edge_density']
titles = ['Vegetated fraction', 'Adult asthma rate', 'Mean patch size', 'Edge density']

### loop through variables and axes to plot histograms
for i, (var, title) in enumerate(zip(variables, titles)):
    ax = axes[i//2, i%2]
    ax.hist(por_ndvi_cdc_gdf[var], bins = 10, color = 'gray', edgecolor = 'black')
    ax.set_title(title)
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')

### adjust layers to prevent overlap
plt.tight_layout()
plt.show()

### drop rows with missing observations
model_df = (
    por_ndvi_cdc_gdf

    ### make a copy
    .copy()

    ### subset to columns
    [['frac_veg', 'asthma', 'mean_patch_size', 'edge_density', 'geometry']]

    ### drop rows (observations) with missing values
    .dropna()
)

model_df

### Perform variable transformation
### log of asthma rate
# model_df['log_asthma'] = np.log(model_df.asthma)

### log of patch size
model_df['log_patch'] = np.log(model_df.mean_patch_size)

model_df

### Visualize transformed variables
hvplot.scatter_matrix(
    model_df
    [[
        'frac_veg',
        'edge_density',
        'log_patch',
        'asthma'
    ]]
)

### q-q plots
### set variables
var_qq = ['frac_veg', 'edge_density', 'log_patch', 'asthma']

### make q-q plot for each variables
plt.figure(figsize = (12, 10))
for i, var in enumerate(var_qq, 1):

    ### make 2x2 facet
    plt.subplot(2, 2, i)

    ### norm distribution
    stats.probplot(model_df[var], dist = "norm", plot = plt)

    ### add title
    plt.title(f'Q-Q plot for {var}')

### plot
plt.tight_layout()
plt.show()

### Select predictor and outcome variables
X = model_df[['edge_density', 'log_patch']]
y = model_df[['asthma']]

### Split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(

    X, y, test_size = 0.33, random_state = 19
)

### Fit a linear regression
reg = LinearRegression()

### fit it to our data
reg.fit(X_train, y_train)

### Predict asthma values for the test dataset
y_test['pred_asthma'] = np.exp(reg.predict(X_test))

### real asthma rate for comparison
y_test['asthma'] = np.exp(y_test.asthma)

y_test

### Plot measured vs. predicted asthma prevalence with a 1-to-1 line
y_max = y_test.asthma.max()

(
    y_test
    .hvplot.scatter(x='asthma', 
                    y='pred_asthma',
                    xlabel = "Measured adult asthma prevalence",
                    ylabel = "Predicted adult asthma prevalence",
                    title = "Linear regression performance - testing data")
    .opts(aspect='equal', 
          xlim=(0, y_max), ylim=(0, y_max), 
          width=600, height=600)
) * hv.Slope(slope=1, y_intercept=0).opts(color='black')

### Compute model error for all census tracts
### use the trained model to predict asthma prevalence for each census tract
model_df['pred_asthma'] = np.exp(reg.predict(X))

### calculate model error
model_df['error_asthma'] = model_df['pred_asthma'] - model_df['asthma']

### Plot error geographically as a chloropleth
(
    plot_chloropleth(model_df, color = 'error_asthma', title = 'Model error (predicted - measured)', cmap = 'RdBu', colorbar_opts={
            'title': 'Asthma prevalence error (%)'
        })

    ### set frame width and aspect ratio
    .opts(frame_width = 600, aspect = 'equal')
)

error_map = (
    plot_chloropleth(
        model_df,
        color="error_asthma",
        title="Model error (predicted - measured)",
        cmap="RdBu",
        colorbar_opts={"title": "Asthma prevalence error (%)"},
    )
    .opts(width=500, height=400)
)

error_map

hv.save(error_map, "portland_model_error.html")

	year	tract	asthma	asthma_ci_low	asthma_ci_high	data_value_unit	totalpopulation	totalpop18plus
0	2023	41051005202	11.0	9.8	12.3	%	2937	2792
1	2023	41051002002	12.1	10.8	13.5	%	3692	3465
2	2023	41051003803	11.5	10.3	12.9	%	4385	3655
3	2023	41051001602	11.6	10.3	12.9	%	4616	3786
4	2023	41051006300	11.7	10.5	13.2	%	5616	4694
...	...	...	...	...	...	...	...	...
192	2023	41051009202	13.0	11.6	14.4	%	5459	4135
193	2023	41051002402	11.3	10.2	12.6	%	3830	3560
194	2023	41051008203	12.6	11.2	13.9	%	5608	4637
195	2023	41051004300	11.2	10.0	12.4	%	1176	953
196	2023	41051008800	12.2	10.9	13.6	%	4204	3462

	place2010	tract2010	ST	PlaceName	plctract10	PlcTrPop10	geometry	year	tract	asthma	asthma_ci_low	asthma_ci_high	data_value_unit	totalpopulation	totalpop18plus
0	4159000	41051000301	41	Portland	4159000-41051000301	5041	POLYGON ((-13650305.147 5699042.371, -13650193...	2023	41051000301	12.1	10.8	13.5	%	5443	4908
1	4159000	41051000302	41	Portland	4159000-41051000302	6709	POLYGON ((-13650063.25 5697279.135, -13649943....	2023	41051000302	11.3	10.1	12.5	%	7191	5476
2	4159000	41051000401	41	Portland	4159000-41051000401	3418	POLYGON ((-13648772.389 5699042.688, -13648767...	2023	41051000401	11.7	10.4	13.0	%	3746	3023
3	4159000	41051000402	41	Portland	4159000-41051000402	3477	POLYGON ((-13647657.747 5699037.289, -13647659...	2023	41051000402	11.6	10.3	12.9	%	3906	3188
4	4159000	41051000501	41	Portland	4159000-41051000501	3732	POLYGON ((-13646585.183 5696754.598, -13646694...	2023	41051000501	11.9	10.7	13.3	%	4134	3307
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
120	4159000	41051009804	41	Portland	4159000-41051009804	3183	POLYGON ((-13634931.924 5701311.194, -13634931...	2023	41051009804	13.1	11.8	14.6	%	3496	2786
121	4159000	41051009903	41	Portland	4159000-41051009903	391	POLYGON ((-13634999.94 5697453.141, -13635002....	2023	41051009903	11.6	10.4	13.0	%	7482	5839
122	4159000	41051010200	41	Portland	4159000-41051010200	583	MULTIPOLYGON (((-13633882.74 5709928.305, -136...	2023	41051010200	12.1	10.9	13.5	%	7130	5680
123	4159000	41051980000	41	Portland	4159000-41051980000	8	POLYGON ((-13660405.502 5712550.235, -13660339...	2023	41051980000	11.7	10.4	13.2	%	55	53
124	4159000	41051009605	41	Portland	4159000-41051009605	0	POLYGON ((-13636161.226 5707323.831, -13636162...	2023	41051009605	12.6	11.4	14.1	%	5710	4409

	tract	total_pixels	frac_veg	mean_patch_size	edge_density
0	41051000301	24731	0.313412	6.658935	0.521101
1	41051000302	32476	0.400819	7.140428	0.767046
2	41051000401	13692	0.280456	3.626062	0.771442
3	41051000402	11869	0.234055	2.930380	0.760752
4	41051000501	11419	0.210439	2.508351	0.681798
...	...	...	...	...	...
120	41051009803	3337	0.202577	3.634409	0.336567
121	41051009804	11798	0.292507	4.733882	0.732260
122	41051009903	2449	0.246223	2.512500	0.350452
123	41051010200	301533	0.039293	11.414258	0.035563
124	41051980000	89236	0.042808	3.669549	0.071589

	place2010	tract2010	ST	PlaceName	plctract10	PlcTrPop10	geometry	year	tract_x	asthma	asthma_ci_low	asthma_ci_high	data_value_unit	totalpopulation	totalpop18plus	tract_y	total_pixels	frac_veg	mean_patch_size	edge_density
0	4159000	41051000301	41	Portland	4159000-41051000301	5041	POLYGON ((-13650305.147 5699042.371, -13650193...	2023	41051000301	12.1	10.8	13.5	%	5443	4908	41051000301	24731	0.313412	6.658935	0.521101
1	4159000	41051000302	41	Portland	4159000-41051000302	6709	POLYGON ((-13650063.25 5697279.135, -13649943....	2023	41051000302	11.3	10.1	12.5	%	7191	5476	41051000302	32476	0.400819	7.140428	0.767046
2	4159000	41051000401	41	Portland	4159000-41051000401	3418	POLYGON ((-13648772.389 5699042.688, -13648767...	2023	41051000401	11.7	10.4	13.0	%	3746	3023	41051000401	13692	0.280456	3.626062	0.771442
3	4159000	41051000402	41	Portland	4159000-41051000402	3477	POLYGON ((-13647657.747 5699037.289, -13647659...	2023	41051000402	11.6	10.3	12.9	%	3906	3188	41051000402	11869	0.234055	2.930380	0.760752
4	4159000	41051000501	41	Portland	4159000-41051000501	3732	POLYGON ((-13646585.183 5696754.598, -13646694...	2023	41051000501	11.9	10.7	13.3	%	4134	3307	41051000501	11419	0.210439	2.508351	0.681798
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
120	4159000	41051009804	41	Portland	4159000-41051009804	3183	POLYGON ((-13634931.924 5701311.194, -13634931...	2023	41051009804	13.1	11.8	14.6	%	3496	2786	41051009804	11798	0.292507	4.733882	0.732260
121	4159000	41051009903	41	Portland	4159000-41051009903	391	POLYGON ((-13634999.94 5697453.141, -13635002....	2023	41051009903	11.6	10.4	13.0	%	7482	5839	41051009903	2449	0.246223	2.512500	0.350452
122	4159000	41051010200	41	Portland	4159000-41051010200	583	MULTIPOLYGON (((-13633882.74 5709928.305, -136...	2023	41051010200	12.1	10.9	13.5	%	7130	5680	41051010200	301533	0.039293	11.414258	0.035563
123	4159000	41051980000	41	Portland	4159000-41051980000	8	POLYGON ((-13660405.502 5712550.235, -13660339...	2023	41051980000	11.7	10.4	13.2	%	55	53	41051980000	89236	0.042808	3.669549	0.071589
124	4159000	41051009605	41	Portland	4159000-41051009605	0	POLYGON ((-13636161.226 5707323.831, -13636162...	2023	41051009605	12.6	11.4	14.1	%	5710	4409	41051009605	116	0.215517	4.166667	0.741667

	frac_veg	asthma	mean_patch_size	edge_density	geometry
0	0.313412	12.1	6.658935	0.521101	POLYGON ((-13650305.147 5699042.371, -13650193...
1	0.400819	11.3	7.140428	0.767046	POLYGON ((-13650063.25 5697279.135, -13649943....
2	0.280456	11.7	3.626062	0.771442	POLYGON ((-13648772.389 5699042.688, -13648767...
3	0.234055	11.6	2.930380	0.760752	POLYGON ((-13647657.747 5699037.289, -13647659...
4	0.210439	11.9	2.508351	0.681798	POLYGON ((-13646585.183 5696754.598, -13646694...
...	...	...	...	...	...
120	0.292507	13.1	4.733882	0.732260	POLYGON ((-13634931.924 5701311.194, -13634931...
121	0.246223	11.6	2.512500	0.350452	POLYGON ((-13634999.94 5697453.141, -13635002....
122	0.039293	12.1	11.414258	0.035563	MULTIPOLYGON (((-13633882.74 5709928.305, -136...
123	0.042808	11.7	3.669549	0.071589	POLYGON ((-13660405.502 5712550.235, -13660339...
124	0.215517	12.6	4.166667	0.741667	POLYGON ((-13636161.226 5707323.831, -13636162...

Urban Greenspace and Asthma Prevalence¶

Get Started with Big Data Pipelines¶

Read More¶

Working with larger-than-memory (big) data¶

Check your work with testing!¶

STEP 1: Set up your analysis¶

STEP 2: Create a site map¶

STEP 3 - Access Asthma and Urban Greenspaces Data¶

Step 3a - Access human health data¶

Step 3b - Join health data with census tract boundaries¶

Step 3c - Get data URLs for urban greenspace imagery¶

Step 3d- Compute NDVI Statistics¶

Repeat for all tracts¶

STEP 4 - Explore your data with plots¶

Chloropleth plots¶

STEP 5: Explore a linear ordinary least-squares regression¶

Model description¶

Step 5a - Data preparation¶

Step 5b - Fit and Predict¶

Step 5c - Explore spatial bias¶

	tract	date	rgbir_href
0	41051000301	2022-07-11	https://naipeuwest.blob.core.windows.net/naip/...
1	41051000301	2022-07-11	https://naipeuwest.blob.core.windows.net/naip/...
2	41051000302	2022-07-11	https://naipeuwest.blob.core.windows.net/naip/...
3	41051000302	2022-07-11	https://naipeuwest.blob.core.windows.net/naip/...
4	41051000401	2022-07-11	https://naipeuwest.blob.core.windows.net/naip/...
...	...	...	...
240	41051010200	2022-07-11	https://naipeuwest.blob.core.windows.net/naip/...
241	41051980000	2022-07-11	https://naipeuwest.blob.core.windows.net/naip/...
242	41051980000	2022-07-11	https://naipeuwest.blob.core.windows.net/naip/...
243	41051980000	2022-07-11	https://naipeuwest.blob.core.windows.net/naip/...
244	41051009605	2022-07-11	https://naipeuwest.blob.core.windows.net/naip/...

	asthma	pred_asthma
27	73130.441833	121694.493606
99	198789.151143	136349.314210
36	98715.771011	173982.656275
2	120571.714986	141342.789221
77	219695.988672	116207.570247
105	162754.791419	120875.041480
124	296558.565298	135835.195944
118	198789.151143	146596.941060
78	89321.723361	97364.751829
16	98715.771011	172366.801852
30	80821.637540	106924.654909
95	162754.791419	182767.267020
122	179871.862254	121281.584296
18	109097.799277	144166.671629
113	400312.191330	122340.176642
8	147266.625241	145617.378311
65	44355.855130	84421.370313
84	98715.771011	109042.513801
13	219695.988672	149694.298701
61	162754.791419	166967.736950
83	54176.363797	96569.709963
46	296558.565298	161301.947040
37	98715.771011	152756.240989
104	268337.286521	123771.684495
70	49020.801136	92049.909520
123	120571.714986	180650.774690
111	242801.617498	100281.438743
14	147266.625241	167276.009377
76	54176.363797	81184.424663
28	147266.625241	157831.676096
20	120571.714986	152386.280398
101	133252.352946	164648.997001
114	179871.862254	108759.464545
47	242801.617498	173630.513186
119	442413.392009	164928.597895
102	120571.714986	147499.095097
107	147266.625241	151698.899208
29	198789.151143	192928.136665
82	198789.151143	120207.897561
6	133252.352946	151091.099066
64	89321.723361	168853.342763
90	296558.565298	116079.620834