4 Data Frame

4.1 Basics

Indexing starts from 1 in R and from 0 in Python.

R provides data.frame() as its tabular data structure; tibble and data.table are packages that extend its capabilities. The Python module pandas provides similar capabilities.

An R data.frame() is a list() of variables with the same number of rows. It is a matrix()-like structure whose columns may be of differing types. Similarly, a Python pandas DataFrame() is a 2-dimensional data structure whose columns can hold data of different types.
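
For example, the first row of a pandas DataFrame sits at position 0, whereas the first row of an R data.frame sits at position 1. A minimal pandas sketch of the 0-based convention (a small throwaway frame, not the df_y built below):

Python

import pandas as pd

qq = pd.DataFrame({'CHR': ['a', 'b', 'c']})

qq['CHR'].iloc[0]                       #Position 0 is the FIRST row in pandas
## 'a'

# The R equivalent of the first row would be aa[1, 'CHR'] (1-based)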

R

nn <- 4L                                          #Number of Rows

# R Data Frame: integer, double, character, logical, factor
df_r <- data.frame(
  INT = 1:nn, NUM = seq(1, nn, 1), CHR = letters[1:nn], LGL = (1:nn %% 2) == 0, 
  FCT = factor(rep(c('No', 'Yes'), length.out = nn)))

# Tibble
tbl <- tibble::tibble(
  INT = 1:nn, NUM = seq(1, nn, 1), CHR = letters[1:nn], LGL = (1:nn %% 2) == 0, 
  FCT = factor(rep(c('No', 'Yes'), length.out = nn)))

stopifnot(all(identical(df_r, as.data.frame(tbl)), 
              identical(tbl, tibble::as_tibble(df_r))))

Python

pp = 4                                            #Number of Rows

qq = {'INT': [i+1 for i in range(pp)], 
      'NUM': (float(i+1) for i in range(pp)),
      'CHR': [chr(i) for i in range(ord('a'), ord('a') +pp)],
      'LGL': [i % 2 == 1 for i in range(pp)],
      'FCT': pd.Categorical((['No', 'Yes'] * pp)[:pp])}

df_y = pd.DataFrame(data = qq)                    #DataFrame from dict

R

aa <- df_r
if(FALSE) print(aa)                     #Data Frame prints ALL Rows (Avoid)
if(FALSE) print(tbl, n = 2)             #Tibble can take number of rows to print

stopifnot(identical(dplyr::slice(aa, 1:2), head(aa, 2)))
head(aa, 2)                                       #Subset by Head
##   INT NUM CHR   LGL FCT
## 1   1   1   a FALSE  No
## 2   2   2   b  TRUE Yes

tail(aa, 2)                                       #Subset by Tail
##   INT NUM CHR   LGL FCT
## 3   3   3   c FALSE  No
## 4   4   4   d  TRUE Yes

Python

pp = df_y.copy()

assert(pp.head(2).equals(pp.iloc[:2]))
pp.head(2)
##    INT  NUM CHR    LGL  FCT
## 0    1  1.0   a  False   No
## 1    2  2.0   b   True  Yes


pp.tail(2)
##    INT  NUM CHR    LGL  FCT
## 2    3  3.0   c  False   No
## 3    4  4.0   d   True  Yes

R

aa <- df_r
class(aa)                                         #Class
## [1] "data.frame"

typeof(aa)                                        #Type
## [1] "list"

dim(aa)                                           #Dimensions [Row, Column]
## [1] 4 5

names(aa)                                         #Column Headers
## [1] "INT" "NUM" "CHR" "LGL" "FCT"

Python

pp = df_y.copy()

print(type(pp))                                   #Explicitly Print type()
## <class 'pandas.core.frame.DataFrame'>


pp.shape                                          #Dimensions [Row, Column]
## (4, 5)


list(pp.columns)                                  #Column Headers
## ['INT', 'NUM', 'CHR', 'LGL', 'FCT']

R

aa <- df_r
str(aa)                                           #Structure
## 'data.frame':    4 obs. of  5 variables:
##  $ INT: int  1 2 3 4
##  $ NUM: num  1 2 3 4
##  $ CHR: chr  "a" "b" "c" "d"
##  $ LGL: logi  FALSE TRUE FALSE TRUE
##  $ FCT: Factor w/ 2 levels "No","Yes": 1 2 1 2

summary(aa)                                       #Summary
##       INT            NUM           CHR               LGL           FCT   
##  Min.   :1.00   Min.   :1.00   Length:4           Mode :logical   No :2  
##  1st Qu.:1.75   1st Qu.:1.75   Class :character   FALSE:2         Yes:2  
##  Median :2.50   Median :2.50   Mode  :character   TRUE :2                
##  Mean   :2.50   Mean   :2.50                                             
##  3rd Qu.:3.25   3rd Qu.:3.25                                             
##  Max.   :4.00   Max.   :4.00

Python

pp = df_y.copy()
list(pp.describe().index)                         #(Default) Summary of Num only
## ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
[x for x in pp.describe(include = 'all').index if x not in pp.describe().index]
## ['unique', 'top', 'freq']

pp.describe(include = 'all').loc[['count', 'max', 'unique']]
##         INT  NUM  CHR  LGL  FCT
## count   4.0  4.0    4    4    4
## max     4.0  4.0  NaN  NaN  NaN
## unique  NaN  NaN    4    2    2


pp.dtypes                                         #data type of each column
## INT       int64
## NUM     float64
## CHR      object
## LGL        bool
## FCT    category
## dtype: object


pp.info(memory_usage = False)                     #Structure
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 4 entries, 0 to 3
## Data columns (total 5 columns):
##  #   Column  Non-Null Count  Dtype   
## ---  ------  --------------  -----   
##  0   INT     4 non-null      int64   
##  1   NUM     4 non-null      float64 
##  2   CHR     4 non-null      object  
##  3   LGL     4 non-null      bool    
##  4   FCT     4 non-null      category
## dtypes: bool(1), category(1), float64(1), int64(1), object(1)

Python

#Prevent the collapse of middle rows or columns into (...)
if(False):
    with pd.option_context('display.max_rows', None, 
                           'display.max_columns', None):
        print(pp.describe(include = 'all'))

R

aa <- df_r
names(aa)
## [1] "INT" "NUM" "CHR" "LGL" "FCT"

bb <- dplyr::select(aa, 2:3)                                #Select by Position
dd <- select(aa, NUM, CHR)                                  #Select by Name
ee <- select(aa, -c(INT, LGL, FCT))                         #Drop Columns

ff <- data.frame('NUM' = aa$NUM, 'CHR' = aa$CHR)
# with() can be used to create an environment using data
gg <- with(aa, data.frame(NUM, CHR))
# [] is used for subsetting; note that a 1-column subset drops to a vector unless drop = FALSE
hh <- aa[ , c('NUM', 'CHR'), drop = FALSE]
ii <- subset(aa, select = c(NUM, CHR), drop = FALSE)        #Avoid

stopifnot(all(sapply(list(dd, ee, ff, gg, hh, ii), identical, bb)))
str(bb)
## 'data.frame':    4 obs. of  2 variables:
##  $ NUM: num  1 2 3 4
##  $ CHR: chr  "a" "b" "c" "d"

Python

pp = df_y.copy()
list(pp.columns)
## ['INT', 'NUM', 'CHR', 'LGL', 'FCT']

qq = pp[['NUM', 'CHR']].copy()                              #Use List of Names
ss = pp.drop(columns = ['INT', 'LGL', 'FCT']).copy()        #Drop Columns
tt = pp.drop(['INT', 'LGL', 'FCT'], axis = 1).copy()        #axis: 0 = rows, 1 = columns
uu = pp.filter(['NUM', 'CHR']).copy()

assert(qq.equals(ss) and qq.equals(tt) and qq.equals(uu))
qq
##    NUM CHR
## 0  1.0   a
## 1  2.0   b
## 2  3.0   c
## 3  4.0   d

R

aa <- df_r
names(aa)
## [1] "INT" "NUM" "CHR" "LGL" "FCT"

names(aa)[c(1, 3)] <- c('A', 'C')                 #By Position
names(aa)[names(aa) == 'NUM'] <- 'B'              #By Name

aa <- dplyr::rename(aa, D = LGL, E = 5)           #New = Old (Reverse of Python)
names(aa)
## [1] "A" "B" "C" "D" "E"

Python

pp = df_y.copy()
list(pp.columns)
## ['INT', 'NUM', 'CHR', 'LGL', 'FCT']


pp.rename(columns = {'INT': 'A', pp.columns[1]: 'B'}, inplace = True) #Old: New
pp.rename(columns = dict(zip(pp.columns[[3]], ['D'])),inplace = True) #Old, New
pp.columns.values[[2, 4]] = ['C', 'E']

list(pp.columns)
## ['A', 'B', 'C', 'D', 'E']

R

aa <- df_r

# Subsetting with order() keeps the original row names attached to the reordered rows
bb <- aa[order(aa$CHR, decreasing = TRUE), ]

# arrange() keeps character row names with their rows but resets automatic (integer) row names to 1:n
dd <- dplyr::arrange(aa, dplyr::desc(CHR))
row.names(dd) <- 4:1
stopifnot(identical(bb, dd))
dd
##   INT NUM CHR   LGL FCT
## 4   4   4   d  TRUE Yes
## 3   3   3   c FALSE  No
## 2   2   2   b  TRUE Yes
## 1   1   1   a FALSE  No

Python

pp = df_y.copy()
list(pp.index)
## [0, 1, 2, 3]


pp.sort_values('CHR', ascending = False, inplace = True)
list(pp.index)
## [3, 2, 1, 0]

4.2 RowNames - Index

R

aa <- data.frame(x = 1:2)
row.names(aa)
## [1] "1" "2"

row.names(aa) <- letters[1:2]                               #rownames

stopifnot(tibble::has_rownames(aa))
row.names(aa)
## [1] "a" "b"

Python

pp = pd.DataFrame(data = {'x': [1, 2]})
list(pp.index)
## [0, 1]


pp.set_index([pd.Index(['a', 'a'])], inplace = True)        #Duplicate index is allowed (unlike R row names)
pp.set_index([pd.Index(['a', 'b'])], inplace = True)
list(pp.index) 
## ['a', 'b']

4.3 Copy

R

george <- fred <- data.frame(x = 11:13)                     #R Copy-on-Modify

# Before modification both names bind to the same address (unlike Python); obj_addr() is from the lobstr package
stopifnot(identical(obj_addr(fred), obj_addr(george)))

aa <- obj_addr(fred)                    #Address before modification

fred[2, 'x'] <- NA                      #Modify
fred$x
## [1] 11 NA 13

stopifnot(obj_addr(fred) != aa)     #Bind to a different address (unlike Python)

# No change in non-modified object (george) address or value (same as Python)
stopifnot(obj_addr(george) == aa)

Python

fred = pd.DataFrame({'x': [1, 2, 3]})
george = fred.copy()                              #Deepcopy by default
percy  = copy.deepcopy(fred)                      #Deepcopy

# Deep copies bind to different addresses even before modification (unlike R)
assert(len({id(fred), id(george), id(percy)}) == 3)

pp = id(fred)                           #Address before modification

fred.at[1, 'x'] = None                  #Modify
list(fred['x'])
## [1.0, nan, 3.0]


assert(id(fred) == pp)                  #No change in address (unlike R)

# No change in non-modified objects (george, percy) address or value (same as R)
assert(george.equals(percy) and not george.equals(fred))
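
Plain assignment in Python, by contrast, copies nothing: both names bind to the same object, so a change made through one name is visible through the other. A minimal sketch (the name ron is only illustrative):

Python

import pandas as pd

fred = pd.DataFrame({'x': [1, 2, 3]})
ron = fred                              #Plain assignment: NO copy, same object

assert(id(ron) == id(fred))             #Both names bind to the same address

fred.at[0, 'x'] = 99                    #Modify through one name
assert(ron.at[0, 'x'] == 99)            #Change is visible through the other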

4.4 Modify

  • R [] (base::Extract)
  • Python:
    • at can only access a single value at a time (faster). It tries to maintain the column's datatype (but sometimes fails). If a column position is used by mistake instead of a label, it silently creates a new column (avoid).
    • loc can select multiple rows and/or columns (slower). It does not maintain the datatype and may change the column type silently.
    • iat and iloc are the integer-position variants of at and loc (see the sketch after this list).
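
The pitfalls above can be shown with a small sketch: passing a column position to at (which expects labels) silently creates a new column, while assigning an incompatible value through loc can change the column dtype silently (newer pandas versions may warn or, eventually, raise instead). This is an illustrative throwaway frame, not the running df_y example.

Python

import pandas as pd

qq = pd.DataFrame({'INT': [1, 2, 3], 'CHR': ['a', 'b', 'c']})

qq.at[0, 1] = 99                        #1 is not a label: NEW column named 1
list(qq.columns)
## ['INT', 'CHR', 1]

qq.iat[0, 0] = 99                       #iat takes positions: modifies 'INT'

qq.loc[1, 'INT'] = 2.5                  #Incompatible value: dtype changes
qq['INT'].dtype                         #Typically float64 after the silent upcast
## dtype('float64')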

R

aa <- df_r
aa[1, 2] <- NA
aa[2, 'CHR'] <- NA
aa
##   INT NUM  CHR   LGL FCT
## 1   1  NA    a FALSE  No
## 2   2   2 <NA>  TRUE Yes
## 3   3   3    c FALSE  No
## 4   4   4    d  TRUE Yes

Python

pp = df_y.copy()
pp.at[0, 'NUM'] = None
pp.loc[1, 'CHR'] = None
pp.iat[3, 1] = 10.0
pp.iloc[2, 0] = 30
pp
##    INT   NUM   CHR    LGL  FCT
## 0    1   NaN     a  False   No
## 1    2   2.0  None   True  Yes
## 2   30   3.0     c  False   No
## 3    4  10.0     d   True  Yes

4.5 Merge, Join

  • A mutating join allows you to combine variables from two tables. (R for Data Science - Hadley Wickham)
    • These are the inner join, left outer join, right outer join, full outer join, and cross join.
  • An inner join matches pairs of observations whenever their keys are equal. Thus, unmatched rows are not included in the result.
    • This means that inner joins are usually not appropriate for analysis, because it is too easy to lose observations.
  • An outer join keeps observations that appear in at least one of the tables.
    • A left join keeps all observations in x.
    • A right join keeps all observations in y.
    • A full join keeps all observations in x and y.
    • A cross join returns the Cartesian product of rows from both tables.
  • See Table 4.1 for Joins of R & Python
  • R: merge(), dplyr::inner_join(), dplyr::left_join(), dplyr::right_join(), dplyr::full_join()
  • Python: merge() (SO)

R

aa <- tibble(ID = c(1, 2, 3), A = c('a1', 'a2', 'a3'))
bb <- tibble(ID = c(1, 2, 4), B = c('b1', 'b2', 'b4'))

# Inner Join
ab_j_inner <- dplyr::inner_join(aa, bb, by = 'ID') 
ab_m_inner <- merge(aa, bb, by = 'ID')

# Left Outer Join
ab_j_left <- dplyr::left_join(aa, bb, by = 'ID')
ab_m_left <- merge(aa, bb, by = 'ID', all.x = TRUE)

# Right Outer Join
ab_j_right <- dplyr::right_join(aa, bb, by = 'ID')
ab_m_right <- merge(aa, bb, by = 'ID', all.y = TRUE)

# Full Outer Join
ab_j_full <- dplyr::full_join(aa, bb, by = 'ID')
ab_m_full <- merge(aa, bb, by = 'ID', all = TRUE)

# Cross Join
ab_j_cross <- full_join(aa, bb, by = character(), suffix = c('_x', '_y'))
ab_m_cross <- merge(aa, bb, by=NULL, suffixes = c('_x', '_y')) |> arrange(ID_x)

Python

pp = pd.DataFrame({'ID': (1, 2, 3), 'A': ('a1', 'a2', 'a3')}) 
qq = pd.DataFrame({'ID': (1, 2, 4), 'B': ('b1', 'b2', 'b4')}) 

pq_inner = pp.merge(qq, on = 'ID', how = 'inner')           #Inner Join
pq_left  = pp.merge(qq, on = 'ID', how = 'left')            #Left Outer Join
pq_right = pp.merge(qq, on = 'ID', how = 'right')           #Right Outer Join
pq_full  = pp.merge(qq, on = 'ID', how = 'outer')           #Full Outer Join
pq_cross = pp.merge(qq, how = 'cross')                      #Cross Join

Table 4.1: Inner Join, Left Outer Join, Right Outer Join, Full Outer Join, Cross Join

Inner Join
ID A  B
1  a1 b1
2  a2 b2

Left Outer Join
ID A  B
1  a1 b1
2  a2 b2
3  a3 NA

Right Outer Join
ID A  B
1  a1 b1
2  a2 b2
4  NA b4

Full Outer Join
ID A  B
1  a1 b1
2  a2 b2
3  a3 NA
4  NA b4

Cross Join
ID_x A  ID_y B
1    a1 1    b1
1    a1 2    b2
1    a1 4    b4
2    a2 1    b1
2    a2 2    b2
2    a2 4    b4
3    a3 1    b1
3    a3 2    b2
3    a3 4    b4
  • Filtering joins match observations in the same way as mutating joins, but affect the observations, not the variables (a pandas equivalent is sketched after the R code below).
    • semi_join(x, y) keeps all observations in x that have a match in y.
    • anti_join(x, y) drops all observations in x that have a match in y.

R

ab_j_semi <- semi_join(aa, bb, by = 'ID')                   # Semi Join
ab_j_anti <- anti_join(aa, bb, by = 'ID')                   # Anti Join
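
pandas has no dedicated semi/anti join functions; one common recipe is isin() filtering on the key column (single-key case). A minimal sketch using the same data as the pp/qq frames of the mutating joins above:

Python

import pandas as pd

pp = pd.DataFrame({'ID': (1, 2, 3), 'A': ('a1', 'a2', 'a3')})
qq = pd.DataFrame({'ID': (1, 2, 4), 'B': ('b1', 'b2', 'b4')})

pq_semi = pp[pp['ID'].isin(qq['ID'])]                       #Semi Join
pq_anti = pp[~pp['ID'].isin(qq['ID'])]                      #Anti Join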

Table 4.2: Semi Join, Anti Join

Semi Join
ID A
1  a1
2  a2

Anti Join
ID A
3  a3

4.6 Sets

  • These operations work with a complete row, comparing the values of every variable.
    • Thus, they expect both tables/data frames to have the same variables (columns), and treat the observations (rows) like sets. (pandas sketches of these operations follow the corresponding R code below.)
    • Inner Join vs. Intersect
      • The INNER JOIN returns duplicates if the id is duplicated in either table; INTERSECT removes duplicates.
      • The INNER JOIN never returns NULL keys (in SQL, NULL does not match NULL), but INTERSECT treats NULLs as equal and will return them.
    • Inner Join vs. Semi Join
      • With a semi join, each record in the first set is returned only once, regardless of how many matches there are in the second set.

R

aa <- tibble(A = c(1, 2), B = c(1, 1))
bb <- tibble(A = c(1, 1), B = c(1, 2))

ab_isect <- intersect(aa, bb)                               # Intersection
ab_union <- union(aa, bb)                                   # Union
ab_sdiff <- setdiff(aa, bb)                                 # x - y
ba_sdiff <- setdiff(bb, aa)                                 # y - x
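
pandas has no intersect()/union()/setdiff() for DataFrames; the usual recipes combine merge(), concat() and drop_duplicates(). A minimal sketch reproducing the intersection, union and x - y results above (y - x is symmetric; row order and index values may differ):

Python

import pandas as pd

pp = pd.DataFrame({'A': (1, 2), 'B': (1, 1)})
qq = pd.DataFrame({'A': (1, 1), 'B': (1, 2)})

pq_isect = pp.merge(qq, how = 'inner').drop_duplicates()              #Intersection
pq_union = pd.concat([pp, qq]).drop_duplicates(ignore_index = True)   #Union
pq_sdiff = (pp.merge(qq, how = 'left', indicator = True)              #x - y
              .query("_merge == 'left_only'")
              .drop(columns = '_merge'))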

Table 4.3: Intersect, Union, x-y, y-x

Intersect
A B
1 1

Union
A B
1 1
2 1
1 2

x - y
A B
2 1

y - x
A B
1 2

R

# ID 2 appears twice in the first table & ID 3 appears three times in the second
aa <- tibble(ID = c(1, 2, 2, 3, 4), 
             A = c('a1', 'a21', 'a22', 'a31', 'a4'))
bb <- tibble(ID = c(1, 2, 3, 3, 3, 5), 
             A = c('a1', 'a21', 'a31', 'a32', 'a33', 'a5'))
#
ii_inn <- inner_join(aa, bb, by = 'ID') 
jj_its <- intersect(aa, bb)
kk_sem <- semi_join(aa, bb, by = 'ID')
#
str(ii_inn, vec.len = nrow(ii_inn))     #Duplicate IDs of Both
## tibble [6 × 3] (S3: tbl_df/tbl/data.frame)
##  $ ID : num [1:6] 1 2 2 3 3 3
##  $ A.x: chr [1:6] "a1" "a21" "a22" "a31" "a31" "a31"
##  $ A.y: chr [1:6] "a1" "a21" "a21" "a31" "a32" "a33"
str(jj_its, vec.len = nrow(jj_its))     #No Duplicate IDs of Either
## tibble [3 × 2] (S3: tbl_df/tbl/data.frame)
##  $ ID: num [1:3] 1 2 3
##  $ A : chr [1:3] "a1" "a21" "a31"
str(kk_sem, vec.len = nrow(kk_sem))     #Duplicates of First found in Second
## tibble [4 × 2] (S3: tbl_df/tbl/data.frame)
##  $ ID: num [1:4] 1 2 2 3
##  $ A : chr [1:4] "a1" "a21" "a22" "a31"
setdiff(kk_sem, jj_its)                 #Extra in Semi Join over Intersection
## # A tibble: 1 × 2
##      ID A    
##   <dbl> <chr>
## 1     2 a22
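
The same duplicate-handling comparison can be sketched in pandas with merge() for the inner join, the merge() + drop_duplicates() recipe for the intersection, and isin() for the semi join (these are recipes, not dedicated pandas functions):

Python

import pandas as pd

pp = pd.DataFrame({'ID': (1, 2, 2, 3, 4),
                   'A': ('a1', 'a21', 'a22', 'a31', 'a4')})
qq = pd.DataFrame({'ID': (1, 2, 3, 3, 3, 5),
                   'A': ('a1', 'a21', 'a31', 'a32', 'a33', 'a5')})

ll_inn = pp.merge(qq, on = 'ID', how = 'inner')         #Duplicate IDs of Both
mm_its = pp.merge(qq, how = 'inner').drop_duplicates()  #No Duplicate IDs of Either
nn_sem = pp[pp['ID'].isin(qq['ID'])]                    #Duplicates of First only

[len(x) for x in (ll_inn, mm_its, nn_sem)]
## [6, 3, 4]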

4.7 Missing Values

  • R: NA represents missing values.
    • Assigning NULL to a data.frame element is problematic (SO), (SO). The column needs to be a list because an atomic vector cannot contain NULL. Further, list() has to be used instead of c() because NULL cannot be concatenated.
  • A Python pandas DataFrame() treats None in numeric columns as np.nan. It keeps None or np.nan in object columns as-is; however, all of these are treated internally as missing values (see the isna() sketch at the end of this section).
  • Note: reticulate currently converts object columns containing None or np.nan to a list containing NULL or NaN. It also converts np.nan in numeric columns to NaN. Ideally, all of these should become NA.

R

# NULL cannot be concatenated
c(NULL, NULL)
## NULL
list(NULL, NULL)
## [[1]]
## NULL
## 
## [[2]]
## NULL

# Only list can contain NULL
aa <- data.frame(x = 1:3, y = I(list(NULL, 'b', 'c')))
aa$x <- list(4, NULL, 6)
str(aa)
## 'data.frame':    3 obs. of  2 variables:
##  $ x:List of 3
##   ..$ : num 4
##   ..$ : NULL
##   ..$ : num 6
##  $ y:List of 3
##   ..$ : NULL
##   ..$ : chr "b"
##   ..$ : chr "c"
##   ..- attr(*, "class")= chr "AsIs"

bb <- aa |> mutate(across(where(is.list), q_NULL_to_NA))
str(bb)
## 'data.frame':    3 obs. of  2 variables:
##  $ x: num  4 NA 6
##  $ y: chr  NA "b" "c"

  q_NULL_to_NA <- function(x) {
    # Convert DataFrame list columns containing NULL or NaN to vector with NA
    # Ex: mutate(aa, across(where(is.list), q_NULL_to_NA))
    # Unlike is.na(), is.null() is not vectorised & is.nan() is not for list
    x[sapply(x, \(y) is.null(y) || is.nan(y))] <- NA
    x <- unlist(x)
    return(x)
  }

Python

pp = pd.DataFrame({'x': (None, 1, 2), 'y': ('a', None, np.nan)}) 
pp.info()
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 3 entries, 0 to 2
## Data columns (total 2 columns):
##  #   Column  Non-Null Count  Dtype  
## ---  ------  --------------  -----  
##  0   x       2 non-null      float64
##  1   y       1 non-null      object 
## dtypes: float64(1), object(1)
## memory usage: 180.0+ bytes
pp
##      x     y
## 0  NaN     a
## 1  1.0  None
## 2  2.0   NaN
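
That the displayed NaN and None values are all treated as missing internally can be checked with isna(), continuing with pp from the block above (a minimal sketch):

Python

pp.isna()                               #None and np.nan are both flagged missing
##        x      y
## 0   True  False
## 1  False   True
## 2  False   True

pp.isna().sum()                         #Missing values per column
## x    1
## y    2
## dtype: int64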