Source code for vtra.mrio.functions

# -*- coding: utf-8 -*-
"""MRIO utility functions
"""
import os

import geopandas as gpd
import pandas as pd

def load_table(data_path):
    """Load the national Input-Output table as a pandas DataFrame.

    Parameters
        - data_path - String name of data path

    Outputs
        - pandas DataFrame with the Input-Output table that is going to be used
    """
    vnm_IO_path = os.path.join(data_path, "INPUT-OUTPUT TABLE 2012",
                               "IO Table 2012 English.xlsx")
    return pd.read_excel(vnm_IO_path, sheet_name='IO_clean', index_col=0)

def load_sectors(data_path):
    """Load the national Input-Output table and extract all sectors.

    Parameters
        - data_path - String name of data path

    Outputs
        - pandas DataFrame with all sectors in the national Input-Output table
    """
    vnm_IO_path = os.path.join(data_path, "INPUT-OUTPUT TABLE 2012",
                               "IO Table 2012 English.xlsx")
    vnmIO_rowcol = pd.read_excel(vnm_IO_path, sheet_name='SectorName')
    return vnmIO_rowcol

def get_final_sector_classification():
    """Return the list of sectors to be used in the new multiregional Input-Output table.

    Outputs
        - list of sectors
    """
    return ['secA', 'secB', 'secC', 'secD', 'secE', 'secF', 'secG', 'secH', 'secI']

def map_sectors(vnm_IO_rowcol):
    """Map the sectors of the loaded national Input-Output table to the sectors
    that are going to be used in the multiregional Input-Output table.

    Parameters
        - vnm_IO_rowcol - pandas DataFrame with all sectors in the national Input-Output table

    Outputs
        - dictionary to map row sectors
        - dictionary to map column sectors
    """
    row_only = vnm_IO_rowcol[vnm_IO_rowcol['mapped'].str.contains(
        "row") | vnm_IO_rowcol['mapped'].str.contains("sec")]
    col_only = vnm_IO_rowcol[vnm_IO_rowcol['mapped'].str.contains(
        "col") | vnm_IO_rowcol['mapped'].str.contains("sec")]

    return dict(zip(row_only.code, row_only.mapped)), dict(zip(col_only.code, col_only.mapped))

def aggregate_table(vnm_IO, vnm_IO_rowcol, in_million=True):
    """Aggregate the national Input-Output table to the number of sectors used in
    the multiregional Input-Output table.

    Parameters
        - vnm_IO - pandas DataFrame of national Input-Output table
        - vnm_IO_rowcol - pandas DataFrame with all sectors in the national Input-Output table
        - in_million - Boolean to specify whether the table should be divided by
          1,000,000 to get values in millions. The default value is set to **True**

    Outputs
        - pandas DataFrame with aggregated national Input-Output table
    """
    sectors = get_final_sector_classification()

    # aggregate table
    mapper_row, mapper_col = map_sectors(vnm_IO_rowcol)
    vnm_IO.index = vnm_IO.index.map(mapper_row.get)
    vnm_IO.columns = vnm_IO.columns.to_series().map(mapper_col)

    aggregated = vnm_IO.groupby(vnm_IO.index, axis=0).sum().groupby(
        vnm_IO.columns, axis=1).sum()

    aggregated = aggregated.reindex(sectors+['col1', 'col2', 'col3'], axis='columns')
    aggregated = aggregated.reindex(sectors+['row1', 'row2', 'row3'], axis='index')

    if in_million == True:
        return aggregated/1000000
    else:
        return aggregated

def is_balanced(io_table):
    """Check whether an Input-Output table is balanced.

    Parameters
        - io_table - Input-Output table

    Outputs
        - prints a statement if the table is balanced
    """
    row = io_table.sum(axis=0)
    col = io_table.sum(axis=1)

    if ((row-col).sum() < 1):
        print('Table is balanced')

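# The functions above are typically chained: load the raw national table, load its
# sector names, aggregate to the nine final sectors, then check the balance. A
# minimal sketch, assuming `data_path` points at the project data directory used
# throughout this module (this helper is illustrative, not part of the original API):
def _example_build_aggregated_table(data_path):
    """Illustrative sketch: build and check the aggregated national IO table."""
    vnm_IO = load_table(data_path)
    vnm_IO_rowcol = load_sectors(data_path)
    aggregated = aggregate_table(vnm_IO, vnm_IO_rowcol, in_million=True)
    is_balanced(aggregated)  # prints 'Table is balanced' if row and column sums agree
    return aggregated
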
def load_provincial_stats(data_path):
    """Load shapefile with provincial-level data.

    Parameters
        - data_path - String name of data path

    Outputs
        - geopandas GeoDataFrame with provincial data
    """
    prov_path = os.path.join(data_path, 'Vietnam_boundaries',
                             'boundaries_stats', 'province_level_stats.shp')

    return gpd.read_file(prov_path)

def estimate_gva(regions, in_million=True):
    """Estimate the Gross Value Added for each sector in each province.

    Parameters
        - regions - pandas DataFrame with provincial/regional data
        - in_million - Boolean to specify whether the values should be divided by
          1,000,000 to get values in millions. The default value is set to **True**

    Outputs
        - list with GVA values per sector in each province
    """
    if in_million == True:
        return list(((regions.pro_nfirm*regions.laborcost) +
                     (regions.pro_nfirm*regions.capital))/1000000)
    else:
        return list((regions.pro_nfirm*regions.laborcost) +
                    (regions.pro_nfirm*regions.capital))

def create_proxies(data_path, notrade=False, own_production_ratio=0.9, min_rice=True):
    """Create all proxies required in the disaggregation process.

    Parameters
        - data_path - String name of data path
        - notrade - Boolean to specify whether trade should be excluded from the
          disaggregation. This should be set to **True** in the first step of the
          disaggregation. The default is set to **False**
        - own_production_ratio - Specify how much supply and demand is locally
          supplied and used, and how much is imported/exported. The default is set to **0.9**
        - min_rice - Boolean to determine whether to use the minimum or the maximum
          rice value from the flow analysis. The default is set to **True**

    Outputs
        - all proxy-level .csv files
    """
    provinces = load_provincial_stats(data_path)
    provinces.name_eng = provinces.name_eng.apply(
        lambda x: x.replace(' ', '_').replace('-', '_'))

    od_table = load_od(data_path, min_rice=min_rice)

    create_indices(data_path, provinces, write_to_csv=True)
    create_regional_proxy(data_path, provinces, write_to_csv=True)
    create_sector_proxies(data_path, provinces, write_to_csv=True)
    create_zero_proxies(data_path, od_table, notrade=notrade, write_to_csv=True)

    if notrade == False:
        create_level14_proxies(data_path, od_table, own_production_ratio, write_to_csv=True)

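# `create_proxies` is the entry point for building the disaggregation inputs. A
# minimal sketch of the two-pass use suggested by the docstrings, assuming
# `data_path` points at the project data directory and that the external
# disaggregation step is run between the two calls (this wrapper is illustrative,
# not part of the original API):
def _example_run_proxy_steps(data_path):
    """Illustrative sketch of the two proxy-building passes."""
    # first step: build the no-trade proxies used for the initial disaggregation
    create_proxies(data_path, notrade=True, own_production_ratio=0.9, min_rice=True)
    # second step: rebuild the proxies with trade included for the final table
    create_proxies(data_path, notrade=False, own_production_ratio=0.9, min_rice=True)
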
def create_regional_proxy(data_path, regions, write_to_csv=True):
    """Create the proxy to disaggregate the national table to the different regions.

    Parameters
        - data_path - String name of data path
        - regions - pandas DataFrame with provincial/regional data
        - write_to_csv - Boolean to specify whether to save the output to a .csv file.
          The default value is set to **True**

    Outputs
        - .csv file with regional proxy data
    """
    # regions['pro_nfirm']*regions['laborcost'] + regions['pro_nfirm']*regions['capital']
    regions['raw_gva'] = estimate_gva(regions)
    subset = regions.loc[:, ['name_eng', 'raw_gva']]
    subset['year'] = 2010
    subset['raw_gva'] = subset.raw_gva.apply(int)/(subset['raw_gva'].sum(axis='index'))
    subset = subset[['year', 'name_eng', 'raw_gva']]
    subset.columns = ['year', 'id', 'gdp']

    if write_to_csv == True:
        csv_path = os.path.join(data_path, 'IO_analysis', 'MRIO_TABLE', 'proxy_reg_vnm.csv')
        subset.to_csv(csv_path, index=False)

def create_indices(data_path, provinces, write_to_csv=True):
    """Create the list of indices required to disaggregate the national table to the
    different regions.

    Parameters
        - data_path - String name of data path
        - provinces - pandas DataFrame with provincial/regional data
        - write_to_csv - Boolean to specify whether to save the output to a .csv file.
          The default value is set to **True**

    Outputs
        - .csv file with the index data
    """
    # prepare index and cols
    region_names = list(provinces.name_eng)
    rowcol_names = list(load_sectors(data_path)['mapped'].unique())

    rows = [x for x in rowcol_names if (x.startswith(
        'sec') | x.startswith('row'))]*len(region_names)
    region_names_list = [item for sublist in [[x]*12 for x in region_names] for item in sublist]

    indices = pd.DataFrame([region_names_list, rows]).T
    indices.columns = ['region', 'sector']
    indices['sector'] = indices['sector'].apply(lambda x: x.replace('row', 'other'))

    if write_to_csv == True:
        csv_path = os.path.join(data_path, 'IO_analysis', 'MRIO_TABLE', 'indices_mrio.csv')
        indices.to_csv(csv_path, index=False)

def create_sector_proxies(data_path, regions, write_to_csv=True):
    """Create the sector proxies required to disaggregate the national table to the
    different sectors in each region.

    Parameters
        - data_path - String name of data path
        - regions - pandas DataFrame with provincial/regional data
        - write_to_csv - Boolean to specify whether to save the output to .csv files.
          The default value is set to **True**

    Outputs
        - set of .csv files with sector proxy data
    """
    # list of sectors
    sector_list = get_final_sector_classification()

    # get own sector classification for region file
    map_dict = map_sect_vnm_to_eng()
    regions = regions.rename(columns=map_dict)

    # get sectoral gva based on proportion of firms in the region
    sector_shares = regions[sector_list].multiply(regions['raw_gva'], axis='index')
    sector_shares.index = regions.name_eng

    for sector in sector_list+['other1', 'other2', 'other3']:

        if sector in ['other1', 'other2', 'other3']:
            subset = pd.DataFrame(sector_shares.sum(axis='columns')).divide(
                pd.DataFrame(sector_shares.sum(axis='columns')).sum(axis='index'))
            subset.columns = [sector]
        else:
            subset = pd.DataFrame(sector_shares.loc[:, sector]).divide(
                pd.DataFrame(sector_shares.loc[:, sector]).sum(axis='index'))

        subset.reset_index(inplace=True, drop=False)
        subset['year'] = 2010
        subset['sector'] = sector+str(1)
        subset[sector] = subset[sector].apply(lambda x: round(x, 7))
        subset = subset[['year', 'sector', 'name_eng', sector]]
        subset.columns = ['year', 'sector', 'region', 'gdp']

        if write_to_csv == True:
            csv_path = os.path.join(data_path, 'IO_analysis',
                                    'MRIO_TABLE', 'proxy_{}.csv'.format(sector))
            subset.to_csv(csv_path, index=False)

def get_trade_value(x, sum_use, sector, own_production_ratio=0.8):
    """Get the trade value between a certain origin and destination.

    Parameters
        - x - row in the Origin-Destination dataframe
        - sum_use - total use in a certain destination
        - sector - sector for which the trade value is estimated
        - own_production_ratio - Specify how much supply and demand is locally
          supplied and used, and how much is imported/exported. The default is set to **0.8**

    Outputs
        - returns trade value
    """
    if x.Destination == x.Origin:
        try:
            return list(sum_use.loc[(sum_use['region'] == x.Destination) &
                                    (sum_use['sector'] == sector)]['value'])[0]*own_production_ratio
        except:
            return 1
    elif x.gdp == 0:
        return 0
    else:
        try:
            return list(sum_use.loc[(sum_use['region'] == x.Destination) &
                                    (sum_use['sector'] == sector)]['value'])[0]*(1-own_production_ratio)*x.ratio
        except:
            return 0

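# How the split works: for an intraregional flow (Destination == Origin) the trade
# value is total use times own_production_ratio; for an interregional flow it is
# total use times (1 - own_production_ratio), shared out by the OD ratio of that
# origin. A minimal sketch with toy numbers (the helper and values are illustrative only):
def _example_trade_value_split():
    """Illustrative sketch of get_trade_value with a two-region toy example."""
    sum_use = pd.DataFrame({'region': ['Ha_Noi', 'Ha_Noi'],
                            'sector': ['secA', 'secC'],
                            'value': [100.0, 50.0]})
    od_rows = pd.DataFrame({'Origin': ['Ha_Noi', 'Da_Nang'],
                            'Destination': ['Ha_Noi', 'Ha_Noi'],
                            'gdp': [10.0, 5.0],
                            'ratio': [0.7, 0.3]})
    # intraregional flow: 100 * 0.8 = 80; interregional flow: 100 * 0.2 * 0.3 = 6
    return [get_trade_value(row, sum_use, 'secA', own_production_ratio=0.8)
            for row in od_rows.itertuples()]
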
def create_level14_proxies(data_path, od_table, own_production_ratio=0.8, write_to_csv=True):
    """Create the level-14 proxies required to disaggregate the national table.

    Parameters
        - data_path - String name of data path
        - od_table - pandas DataFrame with the Origin-Destination matrix
        - own_production_ratio - Specify how much supply and demand is locally
          supplied and used, and how much is imported/exported. The default is set to **0.8**
        - write_to_csv - Boolean to specify whether to save the output to .csv files.
          The default value is set to **True**

    Outputs
        - set of .csv files with level-14 proxy data
    """
    # get sector list
    sector_list_ini = get_final_sector_classification()+['other1', 'other2', 'other3']
    sector_list = [x+str(1) for x in sector_list_ini]

    od_table.loc[od_table['Destination'] == od_table['Origin'], 'gdp'] = 10

    od_sum = pd.DataFrame(od_table.groupby(['Destination', 'Origin']).sum().sum(axis=1))
    od_sum['ratio'] = od_sum.groupby(level=0).apply(lambda x: x / float(x.sum()))
    od_sum.reset_index(inplace=True)
    od_sum.columns = ['Destination', 'Origin', 'gdp', 'ratio']

    df_pretable = pd.read_csv(os.path.join(
        data_path, 'IO_analysis', 'MRIO_TABLE', 'notrade_trade.csv'),
        index_col=[0, 1], header=[0, 1])
    df_pretable = df_pretable.iloc[:, :567]
    sum_use = df_pretable.sum(axis=1)
    sum_use = pd.DataFrame(sum_use*0.1)
    sum_use.reset_index(inplace=True)
    sum_use.columns = ['region', 'sector', 'value']

    combine = []
    for sector in sector_list:
        if sector[:-1] in ['other1', 'other2', 'other3']:
            subset = od_sum.copy()
            subset['year'] = 2010
            subset['sector'] = sector
            subset['gdp'] = 0
            subset.drop('ratio', axis=1, inplace=True)
            combine.append(subset)
        else:
            subset = od_sum.copy()
            subset = subset.loc[od_sum.gdp != 0]
            subset['year'] = 2010
            subset['sector'] = sector
            subset['gdp'] = subset.apply(lambda x: get_trade_value(
                x, sum_use, sector[:-1], own_production_ratio), axis=1)
            # subset['gdp'].apply(lambda x: round(x, 2))
            subset.drop('ratio', axis=1, inplace=True)
            combine.append(subset)

    all_ = pd.concat(combine)
    final_sub = all_[['year', 'sector', 'Origin', 'Destination', 'gdp']]
    final_sub.columns = ['year', 'sector', 'region', 'region', 'gdp']

    if write_to_csv == True:
        csv_path = os.path.join(data_path, 'IO_analysis', 'MRIO_TABLE',
                                'proxy_trade14_{}.csv'.format(sector[:-1]))
        final_sub.to_csv(csv_path, index=False)

def create_zero_proxies(data_path, od_table, notrade=False, write_to_csv=True):
    """Create the trade proxies required to disaggregate the national table.

    Parameters
        - data_path - String name of data path
        - od_table - pandas DataFrame with the Origin-Destination matrix
        - notrade - Boolean to specify whether trade should be excluded from the
          disaggregation. This should be set to **True** in the first step of the
          disaggregation. The default is set to **False**
        - write_to_csv - Boolean to specify whether to save the output to .csv files.
          The default value is set to **True**

    Outputs
        - set of .csv files with zero-trade proxy data
    """
    # get sector list
    sector_list = get_final_sector_classification()+['other1', 'other2', 'other3']
    sector_list = [x+str(1) for x in sector_list]

    # map regions to a consistent format
    mapper = map_regions()
    od_table['Destination'] = od_table['Destination'].apply(lambda x: mapper[x])
    od_table['Origin'] = od_table['Origin'].apply(lambda x: mapper[x])

    od_table = od_table.loc[od_table['Destination'] != od_table['Origin']]

    od_sum = pd.DataFrame(od_table.groupby(['Destination', 'Origin']).sum().sum(axis=1))
    od_sum.reset_index(inplace=True)
    od_sum.columns = ['Destination', 'Origin', 'gdp']

    if notrade == True:
        od_sum['gdp'] = 0

    for sector in sector_list:
        if sector[:-1] in ['other1', 'other2', 'other3']:
            subset = od_sum.copy()
            subset['year'] = 2010
            subset['sector'] = sector
            subset['gdp'] = 0
            combine = []
            for sector2 in sector_list:
                sub_subset = subset.copy()
                sub_subset['subsector'] = sector2
                combine.append(sub_subset)
        else:
            subset = od_sum.copy()
            if notrade == False:
                subset = subset.loc[od_sum.gdp == 0]
            subset['year'] = 2010
            subset['sector'] = sector
            subset['gdp'] = 0
            combine = []
            for sector2 in sector_list:
                sub_subset = subset.copy()
                sub_subset['subsector'] = sector2
                combine.append(sub_subset)

        all_ = pd.concat(combine)
        final_sub = all_[['year', 'sector', 'Origin', 'subsector', 'Destination', 'gdp']]
        final_sub.columns = ['year', 'sector', 'region', 'sector', 'region', 'gdp']

        if write_to_csv == True:
            csv_path = os.path.join(data_path, 'IO_analysis', 'MRIO_TABLE',
                                    'proxy_trade_{}.csv'.format(sector[:-1]))
            final_sub.to_csv(csv_path, index=False)

def load_output(data_path, provinces, notrade=True):
    """Read the output from the disaggregation process and translate it into a usable
    pandas DataFrame.

    Parameters
        - data_path - String name of data path
        - provinces - pandas DataFrame with provincial/regional data
        - notrade - Boolean to specify whether the no-trade output of the first
          disaggregation step should be loaded. The default is set to **True**

    Outputs
        - pandas DataFrame with the disaggregated Input-Output table
    """
    # prepare index and cols
    region_names = list(provinces.name_eng)
    rowcol_names = list(load_sectors(data_path)['mapped'].unique())

    rows = [x for x in rowcol_names if (x.startswith(
        'sec') | x.startswith('row'))]*len(region_names)
    cols = [x for x in rowcol_names if (x.startswith(
        'sec') | x.startswith('col'))]*len(region_names)
    region_names_list = [item for sublist in [[x]*12 for x in region_names] for item in sublist]

    index_mi = pd.MultiIndex.from_arrays([region_names_list, rows], names=('region', 'row'))
    column_mi = pd.MultiIndex.from_arrays([region_names_list, cols], names=('region', 'col'))

    # read output
    if notrade == True:
        output_path = os.path.join(data_path, 'IO_analysis', 'MRIO_TABLE', 'output_notrade.csv')
    else:
        output_path = os.path.join(data_path, 'IO_analysis', 'MRIO_TABLE', 'output.csv')
    output_df = pd.read_csv(output_path, header=None)
    output_df.index = index_mi
    output_df.columns = column_mi

    # create predefined index and col, which is easier to read
    sector_only = [x for x in rowcol_names if x.startswith('sec')]*len(region_names)
    col_only = [x for x in rowcol_names if x.startswith('col')]*len(region_names)

    region_col = [item for sublist in [[x]*9 for x in region_names] for item in sublist] + \
        [item for sublist in [[x]*3 for x in region_names] for item in sublist]

    column_mi_reorder = pd.MultiIndex.from_arrays(
        [region_col, sector_only+col_only], names=('region', 'col'))

    # sum va and imports
    tax_sub = output_df.loc[output_df.index.get_level_values(1) == 'row1'].sum(axis='index')
    import_ = output_df.loc[output_df.index.get_level_values(1) == 'row2'].sum(axis='index')
    valueA = output_df.loc[output_df.index.get_level_values(1) == 'row3'].sum(axis='index')

    output_new = pd.concat([output_df.loc[~output_df.index.get_level_values(1).isin(
        ['row1', 'row2', 'row3'])], pd.DataFrame(tax_sub).T, pd.DataFrame(import_).T,
        pd.DataFrame(valueA).T])

    # output_new = output_new.reindex(index_mi_reorder, axis='index')
    output_new = output_new.reindex(column_mi_reorder, axis='columns')

    # write to new csv
    output_path_new = os.path.join(data_path, 'IO_analysis', 'MRIO_TABLE', 'output_reordered.csv')
    output_new.to_csv(output_path_new)

    return output_new

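# A minimal usage sketch for reading the disaggregated result back in, assuming the
# disaggregation has already written 'output.csv' under IO_analysis/MRIO_TABLE and
# that `data_path` points at the project data directory (this helper is illustrative,
# not part of the original API):
def _example_load_disaggregated_table(data_path):
    """Illustrative sketch: load the final (with-trade) disaggregated IO table."""
    provinces = load_provincial_stats(data_path)
    # same name cleaning as in create_proxies, so the region labels line up
    provinces.name_eng = provinces.name_eng.apply(
        lambda x: x.replace(' ', '_').replace('-', '_'))
    return load_output(data_path, provinces, notrade=False)
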
def map_sect_vnm_to_eng():
    """Convert Vietnamese sector names to the simple sector classification.

    Outputs
        - dictionary to map Vietnamese sectors to simple sector names
    """
    map_dict = {'nongnghiep': 'secA',
                'khaikhoang': 'secB',
                'chebien': 'secC',
                'detmay': 'secD',
                'gogiay': 'secE',
                'sanxuat': 'secF',
                'xaydung': 'secG',
                'thuongmai': 'secH',
                'dichvu': 'secI'}

    return map_dict

def load_od(data_path, min_rice=True):
    """Load the national Origin-Destination matrix as a pandas DataFrame.

    Parameters
        - data_path - String name of data path
        - min_rice - Boolean to determine whether to use the minimum or the maximum
          rice value from the flow analysis. The default is set to **True**

    Outputs
        - pandas DataFrame with the national Origin-Destination matrix
    """
    od_path = os.path.join(data_path, 'OD_data', 'national_scale_od_matrix.xlsx')
    od_table = pd.read_excel(od_path, sheet_name='total')

    if min_rice == True:
        od_table.drop(['max_rice', 'min_tons', 'max_tons'], inplace=True, axis=1)
    else:
        od_table.drop(['min_rice', 'min_tons', 'max_tons'], inplace=True, axis=1)

    od_table = od_table.dropna(subset=['o_region', 'd_region'], axis='index')

    od_table['o_region'] = od_table['o_region'].apply(
        lambda x: x.replace(' ', '_').replace('-', '_'))
    od_table['d_region'] = od_table['d_region'].apply(
        lambda x: x.replace(' ', '_').replace('-', '_'))

    od_table = od_table.rename(columns={'o_region': 'Origin', 'd_region': 'Destination'})

    return od_table

def map_sectors_to_od(od_table):
    """Create a dictionary to map products from the national Origin-Destination matrix
    to the sector classification of the Input-Output table.

    Parameters
        - od_table - pandas DataFrame with the Origin-Destination matrix

    Outputs
        - dictionary to map goods to sectors
    """
    goods = [x for x in od_table.columns if not (
        x.startswith('Origin') | x.startswith('Destination'))]
    sectors_conn = ['secA', 'secC', 'secE', 'secF', 'secG', 'secF', 'secF',
                    'secB', 'secF', 'secF', 'secF', 'secA', 'secC']

    return dict(zip(goods, sectors_conn))

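# The mapping above can be used to collapse the commodity columns of the OD matrix
# into sector-level flows. A minimal sketch, assuming `data_path` points at the
# project data directory (this helper is illustrative, not part of the original API):
def _example_sector_level_od(data_path):
    """Illustrative sketch: aggregate OD commodity flows to IO sectors."""
    od_table = load_od(data_path, min_rice=True)
    good_to_sector = map_sectors_to_od(od_table)
    # reshape to long format, tag each good with its sector, then sum per OD pair
    melted = od_table.melt(id_vars=['Origin', 'Destination'],
                           value_vars=list(good_to_sector.keys()),
                           var_name='good', value_name='flow')
    melted['sector'] = melted['good'].map(good_to_sector)
    return melted.groupby(['Origin', 'Destination', 'sector'])['flow'].sum().unstack('sector')
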
def map_regions():
    """Create a dictionary to map regions to a consistent format.

    Outputs
        - dictionary to map regions to a consistent format
    """
    return {
        'An_Giang': 'An_Giang', 'Ba_Ria_Vung_Tau': 'Ba_Ria_Vung_Tau',
        'Bac_Giang': 'Bac_Giang', 'Bac_Kan': 'Bac_Kan',
        'Bac_Lieu': 'Bac_Lieu', 'Bac_Ninh': 'Bac_Ninh',
        'Ben_Tre': 'Ben_Tre', 'Binh_Dinh': 'Binh_Dinh',
        'Binh_Duong': 'Binh_Duong', 'Binh_Phuoc': 'Binh_Phuoc',
        'Binh_Thuan': 'Binh_Thuan', 'Ca_Mau': 'Ca_Mau',
        'Can_Tho': 'Can_Tho', 'Cao_Bang': 'Cao_Bang',
        'Da_Nang': 'Da_Nang', 'Dak_Lak': 'Dak_Lak',
        'Dak_Nong': 'Dak_Nong', 'Dien_Bien': 'Dien_Bien',
        'Dong_Nai': 'Dong_Nai', 'Dong_Thap': 'Dong_Thap',
        'Gia_Lai': 'Gia_Lai', 'Ha_Giang': 'Ha_Giang',
        'Ha_Nam': 'Ha_Nam', 'Ha_Tay': 'Ha_Noi',
        'Ha_Noi': 'Ha_Noi', 'Ha_Tinh': 'Ha_Tinh',
        'Hai_Duong': 'Hai_Duong', 'Hai_Phong': 'Hai_Phong',
        'Hau_Giang': 'Hau_Giang', 'Ho_Chi_Minh': 'Ho_Chi_Minh',
        'Hoa_Binh': 'Hoa_Binh', 'Hung_Yen': 'Hung_Yen',
        'Khanh_Hoa': 'Khanh_Hoa', 'Kien_Giang': 'Kien_Giang',
        'Kon_Tum': 'Kon_Tum', 'Lai_Chau': 'Lai_Chau',
        'Lam_Dong': 'Lam_Dong', 'Lang_Son': 'Lang_Son',
        'Lao_Cai': 'Lao_Cai', 'Long_An': 'Long_An',
        'Nam_Dinh': 'Nam_Dinh', 'Nghe_An': 'Nghe_An',
        'Ninh_Binh': 'Ninh_Binh', 'Ninh_Thuan': 'Ninh_Thuan',
        'Phu_Tho': 'Phu_Tho', 'Phu_Yen': 'Phu_Yen',
        'Quang_Binh': 'Quang_Binh', 'Quang_Nam': 'Quang_Nam',
        'Quang_Ngai': 'Quang_Ngai', 'Quang_Ninh': 'Quang_Ninh',
        'Quang_Tri': 'Quang_Tri', 'Soc_Trang': 'Soc_Trang',
        'Son_La': 'Son_La', 'Tay_Ninh': 'Tay_Ninh',
        'Thai_Binh': 'Thai_Binh', 'Thai_Nguyen': 'Thai_Nguyen',
        'Thanh_Hoa': 'Thanh_Hoa', 'Thua_Thien_Hue': 'Thua_Thien_Hue',
        'Tien_Giang': 'Tien_Giang', 'Tra_Vinh': 'Tra_Vinh',
        'Tuyen_Quang': 'Tuyen_Quang', 'Vinh_Long': 'Vinh_Long',
        'Vinh_Phuc': 'Vinh_Phuc', 'Yen_Bai': 'Yen_Bai'
    }