# Read the daily observed time series of sea ice area in the Arctic for whole period # Calculate moving average # Treat outliers # Interpolate NaN # Convert sea ice data into CSV file # # # Script version 2.0.0 # # Script works with: # Python version 3.10.13 # # Package version # numpy: 1.26.3 # pandas: 2.2.0 # # Ver1.0.0: Created by Mariko Koseki, 15.11.2023 # Ver1.1.0: Updated by Mariko, 05.02.2024 # Ver1.1.1: Updated by Mariko, 13.03.2024 # Ver2.0.0: Updated by Mariko, 07.06.2024 import numpy as np import pandas as pd import datetime import calendar import math import matplotlib.pyplot as plt import matplotlib.dates as mdates import matplotlib.ticker as ptick import os import glob import sys import platform # Ver1.1.0 print('--- Python version ---') print(platform.python_version()) print('--- Package version ---') print('numpy: ', np.__version__) print('pandas: ', pd.__version__) ##--- Input -----------------------------------------## #ver2.0.0 args = sys.argv if len(args) == 2: year_start = args[1] if len(year_start) == 4: yyyy1 = int(year_start[0:4]) yyyy2 = yyyy1 else: print('---How to use this script---') print('python read_obs_ice_outlier.py ') print('or') print('python read_obs_ice_outlier.py ') print(' = start year; = end year') print('') print('Example: python read_obs_ice_outlier.py 2023') print('Example: python read_obs_ice_outlier.py 1995 2023') print('') sys.exit() elif len(args) == 3: year_start = args[1] year_end = args[2] if (len(year_start) == 4) & (len(year_end) == 4): yyyy1 = int(year_start[0:4]) yyyy2 = int(year_end[0:4]) else: print('') print('---How to use this script---') print('python read_obs_ice_outlier.py ') print('or') print('python read_obs_ice_outlier.py ') print(' = start year; = end year') print('') print('Example: python read_obs_ice_outlier.py 2023') print('Example: python read_obs_ice_outlier.py 1995 2023') print('') sys.exit() else: print('') print('---How to use this script---') print('python read_obs_ice_outlier.py ') print('or') print('python read_obs_ice_outlier.py ') print(' = start year; = end year') print('') print('Example: python read_obs_ice_outlier.py 2023') print('Example: python read_obs_ice_outlier.py 1995 2023') print('') sys.exit() ##--- Functions: Data treatment ---------------------------------------## ### Calculate moving average and difference from original data ### #ver2.0.0 def delta_mov_ave(input_df1, window_size): ''' Calculate moving average ''' df1 = input_df1.copy() window = window_size w = np.ones(window)/window ### AREA ### original1 = df1['AREA'] mov_ave1 = np.convolve(original1, w, mode='same') n_conv1 = math.ceil(window/2) mov_ave1[0] *= window/n_conv1 for i in range(1, n_conv1): mov_ave1[i] *= window/(i+n_conv1) mov_ave1[-i] *= window/(i + n_conv1 - (window % 2)) delta_mov_ave1 = abs(original1 - mov_ave1) df1['Moving_average_AREA'] = mov_ave1 df1['Delta_AREA'] = delta_mov_ave1 ### EXTENT ### original2 = df1['EXTENT'] mov_ave2 = np.convolve(original2, w, mode='same') n_conv2 = math.ceil(window/2) mov_ave2[0] *= window/n_conv2 for i in range(1, n_conv2): mov_ave2[i] *= window/(i+n_conv2) mov_ave2[-i] *= window/(i + n_conv2 - (window % 2)) delta_mov_ave2 = abs(original2 - mov_ave2) df1['Moving_average_EXTENT'] = mov_ave2 df1['Delta_EXTENT'] = delta_mov_ave2 return df1 ### Detect outliers ### #ver2.0.0 def detect_outlier1(input_df2): df2 = input_df2.copy() df2_len = len(df2) ### AREA ### threshold1 = (df2['Delta_AREA'].mean()*5) #print(threshold) #df2['Outlier'] = df2.loc[10 : df2_len - 10, 'AREA'][df2.loc[10 : df2_len - 10, 'Delta_AREA'] > threshold] # not work df2['Outlier_AREA'] = df2.iloc[10 : df2_len - 10, 3][df2.iloc[10 : df2_len - 10, 10] > threshold1] outliers_num1 = df2['Outlier_AREA'].notna().sum() #print(outliers_num1) ### EXTENT ### threshold2 = (df2['Delta_EXTENT'].mean()*5) df2['Outlier_EXTENT'] = df2.iloc[10 : df2_len - 10, 4][df2.iloc[10 : df2_len - 10, 12] > threshold1] outliers_num2 = df2['Outlier_EXTENT'].notna().sum() #print(outliers_num2) return df2 def detect_outlier2(input_df2, threshold): #ver2.0.0 df2 = input_df2.copy() ### AREA ### df2['Outlier_AREA'] = df2['AREA'][df2['Delta_AREA'] > threshold] outliers_num1 = df2['Outlier_AREA'].notna().sum() #print(outliers_num1) ### EXTENT ### df2['Outlier_EXTENT'] = df2['EXTENT'][df2['Delta_EXTENT'] > threshold] outliers_num2 = df2['Outlier_EXTENT'].notna().sum() #print(outliers_num2) return df2 ### Plot outliers ### #ver2.0.0 def plot_outlier(input_df3): df3 = input_df3.copy() #year_obs = df3.loc[0, 'YEAR'] #not work year_obs = df3.iloc[0, 0] time_obs = df3['DATE'] ### AREA ### area = df3['AREA'] mov_ave_area = df3['Moving_average_AREA'] outlier_area = df3['Outlier_AREA'] outliers_num_area = df3['Outlier_AREA'].notna().sum() fig1 = plt.figure(figsize=(8,5)) ax1 = fig1.add_subplot(111) ax1.plot(time_obs, area, linewidth = 1, label = 'Sea ice area') ax1.plot(time_obs, mov_ave_area, label='Moving average') ax1.scatter(time_obs, outlier_area, label = f'Outliers={outliers_num_area}', color='r', marker='*') # Outlier ax1.legend() ## Save figures ## #fig_name1 = 'sea_ice_area_outlier_' + str(year_obs) + '.png' #fig1.savefig(fig_name1, dpi=600, format='png') plt.show() ### EXTENT ### extent = df3['EXTENT'] mov_ave_extent = df3['Moving_average_EXTENT'] outlier_extent = df3['Outlier_EXTENT'] outliers_num_extent = df3['Outlier_EXTENT'].notna().sum() fig2 = plt.figure(figsize=(8,5)) ax2 = fig2.add_subplot(111) ax2.plot(time_obs, extent, linewidth = 1, label = 'Sea ice extent') ax2.plot(time_obs, mov_ave_extent, label='Moving average') ax2.scatter(time_obs, outlier_extent, label = f'Outliers={outliers_num_extent}', color='r', marker='*') # Outlier ax2.legend() ## Save figures ## #fig_name2 = 'sea_ice_extent_outlier_' + str(year_obs) + '.png' #fig2.savefig(fig_name2, dpi=600, format='png') plt.show() ### Handle NaN ### def fill_nan(input_df4): df4 = input_df4.copy() #yyyy = df4.loc[0, 'YEAR'] #not work yyyy = df4.iloc[0, 0] dt_now = datetime.datetime.now() year_now = dt_now.year #print(year_now) #print(yyyy) start_date = datetime.datetime(yyyy, 1, 1) end_date = datetime.datetime(yyyy, 12, 31) if yyyy == 2002: all_date = pd.DataFrame(pd.date_range(start = df4["DATE"].min(), end = end_date, freq="D"), columns=["DATE"]) elif yyyy == year_now: all_date = pd.DataFrame(pd.date_range(start = start_date, end = df4["DATE"].max(), freq="D"), columns=["DATE"]) else: all_date = pd.DataFrame(pd.date_range(start = start_date, end = end_date, freq="D"), columns=["DATE"]) #print(all_date) df_fill = pd.merge(all_date, df4, on = 'DATE', how = 'left') ### AREA ### df_fill['AREA_no_Outlier'] = df_fill['AREA'][df_fill['Outlier_AREA'].isna()] #print(df_fill.head(60)) #print(df_fill) ### EXTENT ### df_fill['EXTENT_no_Outlier'] = df_fill['EXTENT'][df_fill['Outlier_EXTENT'].isna()] ### Remove leap day (29/02) ### print('') print('check if ', yyyy, ' is a leap year:') print(calendar.isleap(yyyy)) print('') if calendar.isleap(yyyy): print(yyyy, ' is a leap year') leap_day = datetime.datetime(yyyy, 2, 29) drop_ind = df_fill.index[df_fill['DATE'] == leap_day] df_drop = df_fill.drop(drop_ind) df_fill2 = df_drop.reset_index(drop=True) else: df_fill2 = df_fill.copy() #print(df_fill2) return df_fill2 ### Interpolate missing values ### def inter_df(input_df5): ''' Linear interpolation The same value is repeated if there is NaN at the top/bottom of DataFrame ''' df5 = input_df5.copy() ### AREA ### df5['AREA_Interpolated'] = df5['AREA_no_Outlier'].interpolate(limit_direction='both', method='linear') ### EXTENT ### df5['EXTENT_Interpolated'] = df5['EXTENT_no_Outlier'].interpolate(limit_direction='both', method='linear') #print(df5) return df5 ### Plot interpolated sea ice area ### def plot_seaice(input_df6): df6 = input_df6.copy() yyyy = df6['DATE'].dt.year year_obs = yyyy[0] #print(year_obs) time_obs = df6['DATE'] ### AREA ### area = df6['AREA'] mov_ave_area = df6['Moving_average_AREA'] outlier_area = df6['Outlier_AREA'] area_inter = df6['AREA_Interpolated'] outliers_num_area = df6['Outlier_AREA'].notna().sum() fig3 = plt.figure(figsize=(8,5)) ax3 = fig3.add_subplot(111) #ax3.plot(time_obs, mov_ave_area, label='Moving average') ax3.plot(time_obs, area_inter, linewidth = 4.7, label= 'Sea ice area (interpolated)', alpha=0.7) ax3.plot(time_obs, area, linewidth = 1.3, label = 'Sea ice area', color='k') ax3.scatter(time_obs, outlier_area, label = f'Outliers={outliers_num_area}', color='r', marker='*', s=70) # Outlier ax3.legend() ## Save figures ## #fig_name3 = 'sea_ice_area_interpolated_' + str(year_obs) + '.png' #fig3.savefig(fig_name3, dpi=600, format='png') plt.show() ### EXTENT ### extent = df6['EXTENT'] mov_ave_extent = df6['Moving_average_EXTENT'] outlier_extent = df6['Outlier_EXTENT'] extent_inter = df6['EXTENT_Interpolated'] outliers_num_extent = df6['Outlier_EXTENT'].notna().sum() fig4 = plt.figure(figsize=(8,5)) ax4 = fig4.add_subplot(111) #ax4.plot(time_obs, mov_ave_extent, label='Moving average') ax4.plot(time_obs, extent_inter, linewidth = 4.7, label= 'Sea ice extent (interpolated)', alpha=0.7) ax4.plot(time_obs, extent, linewidth = 1.3, label = 'Sea ice extent', color='k') ax4.scatter(time_obs, outlier_extent, label = f'Outliers={outliers_num_extent}', color='r', marker='*', s=70) # Outlier ax4.legend() ## Save figures ## #fig_name4 = 'sea_ice_extent_interpolated_' + str(year_obs) + '.png' #fig4.savefig(fig_name4, dpi=600, format='png') plt.show() ##--- Create list of year ------------------------------------------------------## #ver2.0.0 yyyy_int_list = [] for yyyy in range(yyyy1, yyyy2+1): yyyy_int_list.append(yyyy) print(yyyy_int_list) ##--- Set path 2 ----------------------------## ## Set path to input file ## # Ver1.1.0 ''' Set 'out_obs' (path to input csv file) ''' seaice_dir = '/nird/projects/NS9873K/norcpm/validation/seaicepre/' # Ver1.1.1 out_obs = seaice_dir + 'obs/' ## Set path to output file ## # Ver1.1.0 ''' Set 'out_obs_post' (path to output csv file) ''' out_obs_post = seaice_dir + 'obs_post/' if not os.path.exists(out_obs_post): os.makedirs(out_obs_post) for YEAR in yyyy_int_list: #ver2.0.0 print('') print('Year: ', YEAR) print('') file_obs = out_obs + 'sea_ice_obs_' + str(YEAR) + '.csv' #ver2.0.0 print('file name: ', file_obs) print('') ##--- Read CSV files into pandas.DataFrame ------------------------------## #ver2.0.0 df = pd.read_csv(file_obs, index_col=0) print(df) ## Convert 'object' to 'datetime' ## #ver2.0.0 df['DATE'] = pd.to_datetime(df['YEAR'].astype(str) + '-' + df['MMDD'].astype(str)) print(df) ##--- Call Functions for data treatment ---------------------------## #ver2.0.0 ## Detect outliers ## if YEAR == 2011: df_post = detect_outlier2(delta_mov_ave(df, 30), 1000000) elif YEAR == 2002: df_post = detect_outlier1(delta_mov_ave(df, 10)) else: df_post = detect_outlier1(delta_mov_ave(df, 30)) print(df_post) ## Plot outliers ## plot_outlier(df_post) ## Handle NaN ## df_fill_nan = fill_nan(df_post) ## Interpolate missing values ## df_inter = inter_df(df_fill_nan) print(df_inter) ## Plot interpolated sea ice area ## plot_seaice(df_inter) ##--- Save DataFrame as CSV --------------------------------------## #ver2.0.0 df_inter.to_csv(out_obs_post + 'sea_ice_obs_post_' + str(YEAR) + '.csv') print('') print('') print('Saved sea ice area/extent!') print('') print('-----------------------') print('') print("completed!")