Commit ffd9540b authored by Itzel Ruvalcaba

starting git tool for shark data

parent a4432047
# ignore all pyc
__pycache__/
*.pyc
Python3 tool to download/update SHARKweb data for biogeochemistry.
Includes:
NO2, NO3, NH4, PO4, Si, O2, NO2+NO3, total N, total P
Temperature and Salinity
Data are from CTD and/or from bottle (BTL).
Additional variables are DOC, yellow substances, Alkalinity, and pH.
#====================
TOOL contains:
sharkdata.py = script to get data online (URL hardcoded there) for chosen years
read_shark_data.py = reads data for chosen stations, excludes unused variables
                     (more can easily be added if needed),
                     renames variables to more code-friendly names,
                     plots and saves time series for some nutrients
shark_to_netcdf.py = main program, where the user chooses the year(s),
                     the station names (as written on the webpage), and
                     the path where the .nc files will be stored
                     (see the usage sketch below).
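A minimal usage sketch (the years, station name, and output path below are
illustrative, not defaults):
    import read_shark_data
    ds = read_shark_data.shark_station_to_netcdf(['SLÄGGÖ'], 2015, 2018, '/tmp/shark_out')
The resulting file could then be opened with xarray, e.g.
xr.open_dataset('/tmp/shark_out/shark_smhi_SLÄGGÖ_2015-2018.nc').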
#====================
authors:
started by Magnus Wenzer a001985
modified by Itzel Ruvalcaba a002340 to be compatible with Python 3, 12/21
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import pandas as pd
import xarray as xr
import matplotlib.pyplot as plt
import sharkdata
def shark_station_to_netcdf(stationnames, from_year, to_year, outdir):
    """Fetch SHARKdata for the given stations/years and write one NetCDF per station."""
    sharkdata_1 = sharkdata.SharkData()
    datatype = 'PhysicalChemical'
    # IRB: test (datasetnames came back empty for me; mismatched years?)
    datasetnames = sharkdata_1.list_all_datasets_for_year(from_year, to_year, datatype)
    ndata = {}
    frames = {}
    print(stationnames)
    for stationname in stationnames:
        ndata[stationname] = 0
        frames[stationname] = []
    # pick stations
    for datasetname in datasetnames:
        sharkdata_1.load_dataset(datasetname)
        df = sharkdata_1.data[datasetname]
        if df.shape[0] > 0:
            df.station_name = df.station_name.str.strip().str.upper()
            for stationname in stationnames:
                # IRB 12/2021: no need to decode from UTF-8 anymore in Python 3
                df2 = df[df.station_name.str.contains(stationname)]
                ndata[stationname] += df2.shape[0]
                frames[stationname].append(df2)
    # make xarray, pivot table, export to netcdf, and plot
    for stationname in stationnames:
        print(stationname)  # IRB
        df = pd.concat(frames[stationname])
        if df.shape[0] > 0:
            df.sample_depth_m = df.sample_depth_m.astype('float')
            df.value = df.value.astype('float')
            df['date_time'] = pd.to_datetime(df.visit_date + ' ' + df.sample_time)
            df.drop_duplicates(subset=['date_time', 'sample_depth_m', 'parameter'], inplace=True)
            values_pivoted = pd.pivot_table(df,
                                            values='value',
                                            index=['date_time', 'sample_depth_m'],
                                            columns=['parameter'])
            ds = xr.Dataset.from_dataframe(values_pivoted)
            # IRB 12/2021: drop unused variables (trim this list to keep more of them)
            unused = ['Coloured dissolved organic matter CDOM',
                      'Conductivity CTD',
                      'Pressure CTD',
                      'Alkalinity_2',
                      'Humus',
                      'Temperature pH Laboratory',
                      'pH Laboratory',
                      'Aluminium',
                      'Current direction',
                      'Current velocity',
                      'Urea']
            ds = ds.drop_vars([v for v in unused if v in ds.data_vars])
            # IRB 12/2021: left off here?
            # problem: not all datasets contain dissolved oxygen CTD...
            # how to add an empty dimension in ds?
            # IRB 12/2021: rename variables so names are more in agreement with
            # the previous shark data set (and friendlier for coding)
            rename_map = {'Ammonium NH4-N': 'Ammonium_NH4',
                          'Chlorophyll-a bottle': 'Chlorophyll_a_BTL',
                          'Dissolved organic carbon DOC': 'Dissolved_organic_carbon_DOC',
                          'Dissolved oxygen O2 bottle': 'Dissolved_oxygen_BTL',
                          'Dissolved oxygen O2 CTD': 'Dissolved_oxygen_CTD',
                          'Nitrate NO3-N': 'Nitrate_NO3',
                          'Nitrite NO2-N': 'Nitrite_NO2',
                          'Nitrite+Nitrate NO2+NO3-N': 'Nitrite+Nitrate_NO2+NO3',
                          'Phosphate PO4-P': 'Phosphate_PO4',
                          'Salinity CTD': 'Salinity_CTD',
                          'Salinity bottle': 'Salinity_BTL',
                          'Secchi depth': 'Secchi_depth',
                          'Silicate SiO3-Si': 'Silicate_SiO3_Si',
                          'Temperature CTD': 'Temperature_CTD',
                          'Temperature bottle': 'Temperature_BTL',
                          'Total Nitrogen Tot-N': 'Total_N',
                          'Total phosphorus Tot-P': 'Total_P'}
            ds = ds.rename_vars({old: new for old, new in rename_map.items()
                                 if old in ds.data_vars})
            print(list(ds.data_vars))
            # IRB 12/2021: output file name changed to be compatible with Python 3
            # and more in agreement with the previous shark dataset
            ds.to_netcdf('%s/shark_smhi_%s_%s-%s.nc' % (outdir, stationname.upper(), from_year, to_year))
            fig = plt.figure(figsize=(12, 12))
            plt.subplot(4, 1, 1)
            ds['Salinity_CTD'].plot.line(x='date_time', marker='o')
            plt.subplot(4, 1, 2)
            ds['Temperature_CTD'].plot.line(x='date_time', marker='o')
            plt.subplot(4, 1, 3)
            ds['Total_N'].plot.line(x='date_time', marker='o')
            plt.subplot(4, 1, 4)
            ds['Total_P'].plot.line(x='date_time', marker='o')
            # IRB 12/2021: changed to be compatible with Python 3
            plt.savefig('%s/%s_%s-%s.png' % (outdir, stationname.upper(), from_year, to_year), dpi=600)
            plt.close(fig)  # avoid accumulating open figures across stations
        else:
            print('Station: ' + stationname + ' not present in datasets')
    return ds
#if __name__ == "__main__":
# to_year=2018
# from_year=1970
# outdir='/nobackup/fouo4/sm_larar/CoClime/sharkdata/'
# ds=shark_station_to_netcdf(['SLÄGGÖ','B1'],from_year,to_year,outdir)
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
#============================
#Created on 14 March 2018
#@author: a001985
#modified by: Itzel Ruvalcaba to make it work with Python 3, 12/21
#============================
import read_shark_data
from_year=1961
to_year=2020
#outdir='/nobackup/smhid19/users/sm_ruvba/test_sharkdata'
outdir='/nobackup/smhid19/users/sm_ruvba/observations/shark_1961-2020'
# I will need to find out how to download all available data for Kattegat-Skagerrak
# (not per station, as they do not seem to have names; see map in SMHI report 2018; perhaps as spatial maps per year)!
# for now 29 stations
stations=['A5','A13','A15','A17','ANHOLT E','B1','B7','BCS III-10','BY1','BY2','BY4','BY5','BY10','BY15','BY20','BY29','BY31','BY32','BY38','C3','FLADEN','HANÖBUKTEN','N14 FALKENBERG','P2','SLÄGGÖ','SR5','Å17','M6','W LANDSKRONA']
#note: A5 = F3, A13 = F9,
ds=read_shark_data.shark_station_to_netcdf(stations,from_year,to_year,outdir)
#!/usr/bin/env python3
# -*- coding:utf-8 -*-
import json
import urllib.request as urllib2  # Python 3 replacement for urllib2
import codecs
import os
import pandas as pd
class SharkData():
    """
    Class to get data from SHARKdata. Data are in row format and stored in pandas DataFrames.
    """
    def __init__(self):
        self.sharkdata_url = u'http://sharkdata.se/datasets/'
        self.datasets = json.load(urllib2.urlopen(self.sharkdata_url + u'list.json'))
        self.dataset_dict = {}
        for ds in self.datasets:
            self.dataset_dict[ds[u'dataset_name']] = ds
        self.dataset_list = sorted(self.dataset_dict.keys())
        self.datatype_list = sorted(set([item.split('_')[1] for item in self.dataset_list]))
        self.data = {}
    #=======================================================================================================
    def list_all_datasets_for_year(self, from_year, to_year=None, datatype=''):
        print('calling list_all_datasets_for_year')
        if datatype and datatype not in self.datatype_list:
            print('Not a valid datatype: "{}"!'.format(datatype))
            return []
        years = [from_year]
        if to_year:
            print(from_year, to_year)
            years = list(range(from_year, to_year + 1))
        # Dataset names embed the year as the third underscore-separated token,
        # e.g. 'SHARK_PhysicalChemical_2017_BAS_SMHI' -> '2017'.
        year_strings = [str(y) for y in years]
        dataset_list = []
        for dataset_name in self.dataset_list:
            if datatype not in dataset_name:
                continue
            if dataset_name.split('_')[2] in year_strings:
                dataset_list.append(dataset_name)
        return dataset_list
    #=======================================================================================================
    def load_dataset(self, dataset_name):
        if dataset_name not in self.dataset_list:
            print('Not a valid dataset name: "{}"'.format(dataset_name))
            return False
        if dataset_name in self.data.keys():
            print('Dataset "{}" already loaded!'.format(dataset_name))
            return True
        file_path = self._get_file_path(dataset_name)
        header_and_data = urllib2.urlopen(file_path)
        for k, line in enumerate(header_and_data):
            line = line.decode(u'cp1252').strip()
            if k == 0:
                header = line.split('\t')
                data = dict((item, []) for item in header)
            else:
                for h, value in zip(header, line.split('\t')):
                    data[h].append(value)
        df = pd.DataFrame(data, columns=header)
        self.data[dataset_name] = df
        print('Dataset "{}" loaded.'.format(dataset_name))
        return True
    #=======================================================================================================
    def load_dataset_for_year(self, from_year, to_year=None, datatype=''):
        for dataset_name in self.list_all_datasets_for_year(from_year, to_year=to_year, datatype=datatype):
            self.load_dataset(dataset_name)
    #=======================================================================================================
    def save_dataset_to_file(self, dataset_name, directory=''):
        if dataset_name not in self.data.keys():
            print('Could not save dataset "{}". Dataset NOT loaded.'.format(dataset_name))
            return False
        if not directory:
            directory = os.getcwd()
        filename = self.dataset_dict[dataset_name][u'dataset_file_name'].replace(u'.zip', u'.txt')
        file_path = os.path.join(directory, filename)
        self.data[dataset_name].to_csv(file_path, sep='\t', encoding='cp1252', index=False)
        print('Dataset "{}" saved to directory "{}"!'.format(dataset_name, directory))
        return True
    #=======================================================================================================
    def save_all_datasets_to_file(self, directory=''):
        for dataset_name in self.data.keys():
            self.save_dataset_to_file(dataset_name, directory=directory)
    #=======================================================================================================
    def _get_file_path(self, dataset_name):
        return self.sharkdata_url + dataset_name + u'/data.txt'
def execute():
    """
    From sharkdata.se
    Example code to get datasets from SHARKdata. Developed in Python 2.7.
    For Windows users we recommend Python(x,y) to install Python: http://code.google.com/p/pythonxy/
    This Python script can be executed directly in a terminal window:
    $ python get_dataset_from_sharkdata.py
    The following output is expected:
    1. A list of all available datasets, printed in the terminal window.
    2. The content of the first available dataset, printed in the terminal window.
    3. The content of the first available dataset saved as a file in the same directory as the Python script.
       Character encoding in the file will be UTF-8. Change the row "character_encoding = u'utf8'" for other
       encodings.
    For more options, read the documentation at http://test.sharkdata.se/documentation/
    """
    # URL to the datasets part of SHARKdata.
    sharkdata_url = u'http://sharkdata.se/datasets/'
    # Download a list of all available datasets. The JSON format is used.
    datasets = json.load(urllib2.urlopen(sharkdata_url + u'list.json'))
    # Exit if no datasets are found.
    if len(datasets) < 1:
        print(u'No datasets found. Script terminated.')
        return
    # Print some info for all available datasets.
    print(u'\nAvailable datasets on SHARKdata:' + u'\n')
    for dataset in datasets:
        print(u'   Datatype: ' + dataset[u'datatype'] + u'   Name: ' + dataset[u'dataset_name'])
    # Get the name of the first dataset in the list.
    dataset_name = datasets[0][u'dataset_name']
    # Download header and data and print the content. The text format is used.
    print(u'\nPrint dataset content for: ' + dataset_name + u'\n')
    header_and_data = urllib2.urlopen(sharkdata_url + dataset_name + u'/data.txt')
    for row in header_and_data:
        # The text format character encoding is cp1252 (equal to windows-1252).
        row = row.decode(u'cp1252')
        print(row.strip())
    # Download header and data and save to file.
    dataset_name = datasets[0][u'dataset_name']
    filename = datasets[0][u'dataset_file_name'].replace(u'.zip', u'.txt')
    character_encoding = u'utf8'  # Some alternatives: cp1252, utf-8, utf-16, ascii, latin1, macroman.
    row_delimiter = u'\r\n'
    print(u'\nDataset content for: ' + dataset_name + u' to file: ' + filename + u'\n')
    out_file = None
    try:
        out_file = codecs.open(filename, mode='w', encoding=character_encoding)
        header_and_data = urllib2.urlopen(sharkdata_url + dataset_name + u'/data.txt')
        for row in header_and_data:
            row = row.decode(u'cp1252')
            out_file.write(row.strip() + row_delimiter)
    finally:
        if out_file:
            out_file.close()
    print(os.getcwd())
    print(filename)
    print(type(header_and_data))
# IRB: commented
#if __name__ == "__main__":
# save_directory = 'TempSharkData'
# if 1:
# sharkdata_1 = SharkData()
# datatype = 'PhysicalChemical'
# from_year = 2000
# to_year = 2018
# print('\n'.join(sharkdata_1.list_all_datasets_for_year(from_year, to_year, datatype)))
# if 0:
# sharkdata_2 = SharkData()
# dataset_name = u'SHARK_PhysicalChemical_2017_BAS_SMHI'
# sharkdata_2.load_dataset(dataset_name)
# df = sharkdata_2.data[dataset_name]
# sharkdata_2.save_dataset_to_file(dataset_name, directory=save_directory)
# print('sharkdata 2')
#
# if 0:
# sharkdata_3 = SharkData()
# datatype = 'PhysicalChemical'
#
# from_year = 1950
# to_year = 1955
# sharkdata_3.load_dataset_for_year(from_year, to_year, datatype)
# sharkdata_3.save_all_datasets_to_file(directory=save_directory)
# print('sharkdata 3')