6 Import - Export
6.1 Test DataFrame
R
nn <- 4L
df_r <- data.frame(INT = 1:nn, NUM = seq(1-1, nn-1, 1),
CHR = letters[1:nn], LGL = {1:nn %% 2} == 0)
str(df_r)
## 'data.frame': 4 obs. of 4 variables:
## $ INT: int 1 2 3 4
## $ NUM: num 0 1 2 3
## $ CHR: chr "a" "b" "c" "d"
## $ LGL: logi FALSE TRUE FALSE TRUE
Python
nn = 4
df_y = pd.DataFrame({'INT': [i+1 for i in range(nn)],
'NUM': np.arange(0.0, nn),
'CHR': [chr(i) for i in range(ord('a'), ord('a') +nn)],
'LGL': [i % 2 == 1 for i in range(nn)]})
df_y.info(memory_usage = False)
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 4 entries, 0 to 3
## Data columns (total 4 columns):
## # Column Non-Null Count Dtype
## --- ------ -------------- -----
## 0 INT 4 non-null int64
## 1 NUM 4 non-null float64
## 2 CHR 4 non-null object
## 3 LGL 4 non-null bool
## dtypes: bool(1), float64(1), int64(1), object(1)
6.2 CSV
- R
write.csv()
&read.csv()
are available in R. However readr package provides fasterreadr::write_csv()
&readr::read_csv()
methods - Python pandas module has to_csv() & read_csv()
- Note: If rows have different number of columns
JSON
would be a better choice compared toCSV
R
loc <- 'data/R_01_readr.csv' #PATH
readr::write_csv(df_r, file = loc) #To CSV
aa <- readr::read_csv(loc, show_col_types = FALSE,
col_types = list(INT = readr::col_integer(), NUM = col_double(),
CHR = col_character(), LGL = col_logical()))
attr(aa, 'spec') <- NULL #Drop Attribute
if(nrow(readr::problems(aa)) == 0L) attr(aa, 'problems') <- NULL
stopifnot(identical(df_r, as.data.frame(aa)))
6.3 R RDS
- R
saveRDS()
&readRDS()
are available in R. However readr package provides fasterreadr::write_rds()
&readr::read_rds()
methods - Python needs rpy2 module which is unavailable in Windows
6.4 Python Arrow Feather
- Python pyarrow module has write_feather() & read_feather()
- Pandas itself has moved away from
pickle
(&msgpack
) topyarrow
- Note: Within reticulate environment, due to the conflict between Python pyarrow module and R arrow package, the R package is not being used for now.
6.5 R Dump
- R
dump()
creates.R
file with thestructure()
of all the objects passed to it. This file can be sourced bysource()
. - Unlike
saveRDS()
multiple objects can be saved. - Caution: It also saves ‘object names’ thus it may overwrite already existing ones.
6.6 Python Pickle & HDF5
Python pickle module can be used to save python objects. However, it is unsecure and arbitrary code execution is possible (Pickle Flaws).
Pandas uses pickle (via PyTables) for reading and writing HDF5
files. So avoid this too. Caution
6.7 Clipboard
- R
read.delim()
is available. However readr package provides easierreadr::read_delim()
usingreadr::clipboard()
method - Python pandas module has read_clipboard()
R
if(FALSE) print(df_r) #Inconsistent separator
cat(readr::format_csv(df_r)) #Output without '#' for easy copy to clipboard
INT,NUM,CHR,LGL
1,0,a,FALSE
2,1,b,TRUE
3,2,c,FALSE
4,3,d,TRUE
if(FALSE) {
aa <- readr::read_delim(clipboard()) #PATH: 'data/R_01_readr.csv'
aa$INT <- as.integer(aa$INT)
attr(aa, 'spec') <- NULL
if(nrow(readr::problems(aa)) == 0L) attr(aa, 'problems') <- NULL
stopifnot(identical(df_r, as.data.frame(aa)))
dput(aa)
}
Python
print(df_y) #Output without '#' for easy copy to clipboard
INT NUM CHR LGL
0 1 0.0 a False
1 2 1.0 b True
2 3 2.0 c False
3 4 3.0 d True
if(False):
if 'pp' in globals(): del pp
pp = pd.read_clipboard() #PATH: 'data/Y_01_pandas.csv'
pp['NUM'] = pp['NUM'].astype('float64')
pp.info(memory_usage = False)
assert(df_y.equals(pp))
print(pp)
6.8 Scripts .r & .py
R has
writeLines()
,file.exists()
,file.remove()
& other related functions for file operations andsource()
to execute.r
scripts-
Note: Script execution is similar in R and Python i.e. On Error, next line is not executed. However, Chunk execution is different i.e. in the R chunk next line is not executed whereas in the Python chunk it will be executed even after Error is thrown.
- Thus, in R chunk,
stopifnot(FALSE)
is enough to stop chunk execution - Whereas, in Python,
assert(False)
,exit(1)
,quit(1)
,sys.exit(1)
,raise SystemExit
do not prevent further execution - On the other hand
os._exit(1)
not only quits the Python but also the R session. So, this should be avoided.
- Thus, in R chunk,
R
if(FALSE){# Avoid writing executable scripts to directories
loc <- 'data/R_02_script.r'
txt <- "stopifnot(TRUE) # Execution stops on ERROR\nprint('Hello')"
writeLines(txt, loc) #Create R Script
source(loc) #Source R Script
if(file.exists(loc)) file.remove(loc) #Delete with no recovery
}
6.9 Standard Datasets
R
data(package = 'dplyr')$results[ , 'Item'] #Load or List Datasets
## [1] "band_instruments" "band_instruments2" "band_members"
## [4] "starwars" "storms"
dim(dplyr::storms)
## [1] 19537 13
loc <- 'data/R_03_iris.rds' #PATH
if(!exists(loc)) {# Headers | Replace '.' by '_' | To lowercase
aa <- datasets::iris |> rename_with(make.names) |>
rename_with(~ tolower(gsub('.', '_', .x, fixed = TRUE)))
readr::write_rds(aa, file = loc) #Dataset: Iris
} else {
aa <- readr::read_rds(loc)
}
str(aa, vec.len = 2)
## 'data.frame': 150 obs. of 5 variables:
## $ sepal_length: num 5.1 4.9 4.7 4.6 5 ...
## $ sepal_width : num 3.5 3 3.2 3.1 3.6 ...
## $ petal_length: num 1.4 1.4 1.3 1.5 1.4 ...
## $ petal_width : num 0.2 0.2 0.2 0.2 0.2 ...
## $ species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 ...
Python
loc = 'data/Y_03_iris.feather' #PATH
if(not os.path.exists(loc)):
pp = sns.load_dataset('iris') #Needs Internet
list(pp.columns)
#['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
qq = sm.datasets.get_rdataset('iris').data #Needs Internet
qq.columns = pp.columns
#dir(sklearn.datasets) #All Datasets
ss = sklearn.datasets.load_iris(as_frame = True).data #Offline, No Target
tt = sklearn.datasets.load_iris() #Offline, Bunch
uu = pd.DataFrame(data = np.c_[tt['data'], tt['target']],
columns = tt['feature_names'] + ['target']).astype({'target': int}) \
.assign(species = lambda x: x['target'].map(
dict(enumerate(tt['target_names'])))) \
.drop('target', axis = 1)
uu.columns = pp.columns
assert(uu.equals(pp) and uu.equals(qq))
pyarrow.feather.write_feather(uu, loc) #Dataset: Iris
else:
uu = pyarrow.feather.read_feather(loc)
uu.info(memory_usage = False)
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 150 entries, 0 to 149
## Data columns (total 5 columns):
## # Column Non-Null Count Dtype
## --- ------ -------------- -----
## 0 sepal_length 150 non-null float64
## 1 sepal_width 150 non-null float64
## 2 petal_length 150 non-null float64
## 3 petal_width 150 non-null float64
## 4 species 150 non-null object
## dtypes: float64(4), object(1)