6 Import - Export

6.1 Test DataFrame

nn <- 4L
df_r <- data.frame(INT = 1:nn, NUM = seq(1-1, nn-1, 1), 
                   CHR = letters[1:nn], LGL = {1:nn %% 2} == 0)
str(df_r)
## 'data.frame':    4 obs. of  4 variables:
##  $ INT: int  1 2 3 4
##  $ NUM: num  0 1 2 3
##  $ CHR: chr  "a" "b" "c" "d"
##  $ LGL: logi  FALSE TRUE FALSE TRUE

Python

nn = 4
df_y = pd.DataFrame({'INT': [i+1 for i in range(nn)], 
      'NUM': np.arange(0.0, nn),
      'CHR': [chr(i) for i in range(ord('a'), ord('a') +nn)],
      'LGL': [i % 2 == 1 for i in range(nn)]})
df_y.info(memory_usage = False)
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 4 entries, 0 to 3
## Data columns (total 4 columns):
##  #   Column  Non-Null Count  Dtype  
## ---  ------  --------------  -----  
##  0   INT     4 non-null      int64  
##  1   NUM     4 non-null      float64
##  2   CHR     4 non-null      object 
##  3   LGL     4 non-null      bool   
## dtypes: bool(1), float64(1), int64(1), object(1)

6.2 CSV

R write.csv() & read.csv() are available in R. However readr package provides faster readr::write_csv() & readr::read_csv() methods
Python pandas module has to_csv() & read_csv()
Note: If rows have different number of columns JSON would be a better choice compared to CSV

loc <- 'data/R_01_readr.csv'                                #PATH
readr::write_csv(df_r, file = loc)                          #To CSV

aa <- readr::read_csv(loc, show_col_types = FALSE, 
              col_types = list(INT = readr::col_integer(), NUM = col_double(), 
                               CHR = col_character(), LGL = col_logical()))
attr(aa, 'spec') <- NULL                                    #Drop Attribute
if(nrow(readr::problems(aa)) == 0L) attr(aa, 'problems') <- NULL
stopifnot(identical(df_r, as.data.frame(aa)))

Python

loc = 'data/Y_01_pandas.csv'                                #PATH
df_y.to_csv(loc, index = False)                             #To CSV

if 'pp' in globals(): del pp
pp = pd.read_csv(loc, dtype = {'INT': np.int64, 'NUM': float, 
                               'CHR': object, 'LGL': bool})
assert(df_y.equals(pp))

6.3 R RDS

R saveRDS() & readRDS() are available in R. However readr package provides faster readr::write_rds() & readr::read_rds() methods
Python needs rpy2 module which is unavailable in Windows

loc <- 'data/R_01_readr.rds'                                #PATH
readr::write_rds(df_r, file = loc)                          #To RDS

aa <- readr::read_rds(loc)
stopifnot(identical(df_r, aa))

6.4 Python Arrow Feather

Python pyarrow module has write_feather() & read_feather()
Pandas itself has moved away from pickle (& msgpack) to pyarrow
Note: Within reticulate environment, due to the conflict between Python pyarrow module and R arrow package, the R package is not being used for now.

Python

loc = 'data/Y_01_pyarrow.feather'                           #PATH
pyarrow.feather.write_feather(df_y, loc)                    #To Feather

if 'pp' in globals(): del pp
pp = pyarrow.feather.read_feather(loc)
assert(df_y.equals(pp))

6.5 R Dump

R dump() creates .R file with the structure() of all the objects passed to it. This file can be sourced by source().
Unlike saveRDS() multiple objects can be saved.
Caution: It also saves ‘object names’ thus it may overwrite already existing ones.

aa <- df_r
loc <- 'data/R_01_dump.r'
dump(c('aa'), file = loc)                                   #To R File
rm(aa)
source(loc)                                                 #Source
stopifnot(all(exists('aa'), identical(df_r, aa)))

6.6 Python Pickle & HDF5

Python pickle module can be used to save python objects. However, it is unsecure and arbitrary code execution is possible (Pickle Flaws). Pandas uses pickle (via PyTables) for reading and writing HDF5 files. So avoid this too. Caution

Python

if(False):
    # Avoid Pickle
    loc = 'data/Y_01_pickle.pkl'
    df_y.to_pickle(loc)                                     #To Pickle
    
    if 'pp' in globals(): del pp
    pp = pd.read_pickle(loc)
    assert(df_y.equals(pp))

6.7 Clipboard

R read.delim() is available. However readr package provides easier readr::read_delim() using readr::clipboard() method
Python pandas module has read_clipboard()

if(FALSE) print(df_r)         #Inconsistent separator
cat(readr::format_csv(df_r))  #Output without '#' for easy copy to clipboard
INT,NUM,CHR,LGL
1,0,a,FALSE
2,1,b,TRUE
3,2,c,FALSE
4,3,d,TRUE

if(FALSE) {
  aa <- readr::read_delim(clipboard())            #PATH: 'data/R_01_readr.csv'
  aa$INT <- as.integer(aa$INT)
  attr(aa, 'spec') <- NULL
  if(nrow(readr::problems(aa)) == 0L) attr(aa, 'problems') <- NULL
  stopifnot(identical(df_r, as.data.frame(aa)))
  dput(aa)
}

Python

print(df_y)                   #Output without '#' for easy copy to clipboard
   INT  NUM CHR    LGL
0    1  0.0   a  False
1    2  1.0   b   True
2    3  2.0   c  False
3    4  3.0   d   True

if(False):
    if 'pp' in globals(): del pp
    pp = pd.read_clipboard()                      #PATH: 'data/Y_01_pandas.csv'
    pp['NUM'] = pp['NUM'].astype('float64')
    pp.info(memory_usage = False)
    assert(df_y.equals(pp))
    print(pp)

6.8 Scripts .r & .py

R has writeLines(), file.exists(), file.remove() & other related functions for file operations and source() to execute .r scripts
Python has open(), with(), exists(), remove()
Note: Script execution is similar in R and Python i.e. On Error, next line is not executed. However, Chunk execution is different i.e. in the R chunk next line is not executed whereas in the Python chunk it will be executed even after Error is thrown.
- Thus, in R chunk, stopifnot(FALSE) is enough to stop chunk execution
- Whereas, in Python, assert(False), exit(1), quit(1), sys.exit(1), raise SystemExit do not prevent further execution
- On the other hand os._exit(1) not only quits the Python but also the R session. So, this should be avoided.

if(FALSE){# Avoid writing executable scripts to directories
  loc <- 'data/R_02_script.r'
  txt <- "stopifnot(TRUE) # Execution stops on ERROR\nprint('Hello')"
  writeLines(txt, loc)                            #Create R Script
  source(loc)                                     #Source R Script
  if(file.exists(loc)) file.remove(loc)           #Delete with no recovery  
}

Python

if(False): # Avoid writing executable scripts to directories
    loc = 'data/Y_02_script.py'
    with open(loc, 'w') as f:
        f.write("assert(True) # Execution stops on ERROR\nprint('Hello')")
    exec(open(loc).read())                        #Execute Python Script
    if(os.path.exists(loc)): os.remove(loc)

6.9 Standard Datasets

data(package = 'dplyr')$results[ , 'Item']        #Load or List Datasets
## [1] "band_instruments"  "band_instruments2" "band_members"     
## [4] "starwars"          "storms"

dim(dplyr::storms)
## [1] 19537    13

loc <- 'data/R_03_iris.rds'                                 #PATH
if(!exists(loc)) {# Headers | Replace '.' by '_' | To lowercase 
    aa <- datasets::iris |> rename_with(make.names) |> 
      rename_with(~ tolower(gsub('.', '_', .x, fixed = TRUE)))
    readr::write_rds(aa, file = loc)                        #Dataset: Iris
} else {
    aa <- readr::read_rds(loc) 
}

str(aa, vec.len = 2)
## 'data.frame':    150 obs. of  5 variables:
##  $ sepal_length: num  5.1 4.9 4.7 4.6 5 ...
##  $ sepal_width : num  3.5 3 3.2 3.1 3.6 ...
##  $ petal_length: num  1.4 1.4 1.3 1.5 1.4 ...
##  $ petal_width : num  0.2 0.2 0.2 0.2 0.2 ...
##  $ species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 ...

Python

loc = 'data/Y_03_iris.feather'                              #PATH
if(not os.path.exists(loc)):
    pp = sns.load_dataset('iris')                           #Needs Internet
    list(pp.columns) 
    #['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
    qq = sm.datasets.get_rdataset('iris').data              #Needs Internet
    qq.columns = pp.columns
    #dir(sklearn.datasets)                                  #All Datasets
    ss = sklearn.datasets.load_iris(as_frame = True).data   #Offline, No Target
    tt = sklearn.datasets.load_iris()                       #Offline, Bunch
    uu = pd.DataFrame(data = np.c_[tt['data'], tt['target']],
          columns = tt['feature_names'] + ['target']).astype({'target': int}) \
          .assign(species = lambda x: x['target'].map(
                            dict(enumerate(tt['target_names'])))) \
          .drop('target', axis = 1)
    uu.columns = pp.columns
    assert(uu.equals(pp) and uu.equals(qq))
    pyarrow.feather.write_feather(uu, loc)                  #Dataset: Iris
else:
    uu = pyarrow.feather.read_feather(loc)


uu.info(memory_usage = False)
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 150 entries, 0 to 149
## Data columns (total 5 columns):
##  #   Column        Non-Null Count  Dtype  
## ---  ------        --------------  -----  
##  0   sepal_length  150 non-null    float64
##  1   sepal_width   150 non-null    float64
##  2   petal_length  150 non-null    float64
##  3   petal_width   150 non-null    float64
##  4   species       150 non-null    object 
## dtypes: float64(4), object(1)

5 Matrix & NumPy Array

7 Window Functions