4 Data Frame
4.1 Basics
Indexing starts from 1 in R and from 0 in Python.
R provides data.frame()
for tabular data structure. tibble & data.table are packages which extends its capabilities. Python module pandas provide similar capabilities
R data.frame()
is a list()
of variables of the same number of rows. It is a matrix()
like structure whose columns may be of differing types. Similarly, Python Pandas DataFrame() is a 2-dimensional data structure that can store data of different types in columns.
- Create R
data.frame()
,tibble::tibble()
and Python DataFrame()
R
nn <- 4L #Number of Rows
# R Data Frame: integer, double, character, logical, factor
df_r <- data.frame(
INT = 1:nn, NUM = seq(1, nn, 1), CHR = letters[1:nn], LGL = (1:nn %% 2) == 0,
FCT = factor(rep(c('No', 'Yes'), length.out = nn)))
# Tibble
tbl <- tibble::tibble(
INT = 1:nn, NUM = seq(1, nn, 1), CHR = letters[1:nn], LGL = (1:nn %% 2) == 0,
FCT = factor(rep(c('No', 'Yes'), length.out = nn)))
stopifnot(all(identical(df_r, as.data.frame(tbl)),
identical(tbl, tibble::as_tibble(df_r))))
Python
R
aa <- df_r
if(FALSE) print(aa) #Data Frame prints ALL Rows (Avoid)
if(FALSE) print(tbl, n = 2) #Tibble can take number of rows to print
stopifnot(identical(dplyr::slice(aa, 1:2), head(aa, 2)))
head(aa, 2) #Subset by Head
## INT NUM CHR LGL FCT
## 1 1 1 a FALSE No
## 2 2 2 b TRUE Yes
tail(aa, 2) #Subset by Tail
## INT NUM CHR LGL FCT
## 3 3 3 c FALSE No
## 4 4 4 d TRUE Yes
Python
- About DataFrame:
R
Python
R
aa <- df_r
str(aa) #Structure
## 'data.frame': 4 obs. of 5 variables:
## $ INT: int 1 2 3 4
## $ NUM: num 1 2 3 4
## $ CHR: chr "a" "b" "c" "d"
## $ LGL: logi FALSE TRUE FALSE TRUE
## $ FCT: Factor w/ 2 levels "No","Yes": 1 2 1 2
summary(aa) #Summary
## INT NUM CHR LGL FCT
## Min. :1.00 Min. :1.00 Length:4 Mode :logical No :2
## 1st Qu.:1.75 1st Qu.:1.75 Class :character FALSE:2 Yes:2
## Median :2.50 Median :2.50 Mode :character TRUE :2
## Mean :2.50 Mean :2.50
## 3rd Qu.:3.25 3rd Qu.:3.25
## Max. :4.00 Max. :4.00
Python
pp = df_y.copy()
list(pp.describe().index) #(Default) Summary of Num only
## ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
[x for x in pp.describe(include = 'all').index if x not in pp.describe().index]
## ['unique', 'top', 'freq']
pp.describe(include = 'all').loc[['count', 'max', 'unique']]
## INT NUM CHR LGL FCT
## count 4.0 4.0 4 4 4
## max 4.0 4.0 NaN NaN NaN
## unique NaN NaN 4 2 2
pp.dtypes #data type of each column
## INT int64
## NUM float64
## CHR object
## LGL bool
## FCT category
## dtype: object
pp.info(memory_usage = False) #Structure
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 4 entries, 0 to 3
## Data columns (total 5 columns):
## # Column Non-Null Count Dtype
## --- ------ -------------- -----
## 0 INT 4 non-null int64
## 1 NUM 4 non-null float64
## 2 CHR 4 non-null object
## 3 LGL 4 non-null bool
## 4 FCT 4 non-null category
## dtypes: bool(1), category(1), float64(1), int64(1), object(1)
Python
- Select Columns:
- R:
[]
(base::Extract
),dplyr::select()
,data.frame()
,with()
,subset()
(Avoid) - Python: [], drop(), filter()
- R:
R
aa <- df_r
names(aa)
## [1] "INT" "NUM" "CHR" "LGL" "FCT"
bb <- dplyr::select(aa, 2:3) #Select by Position
dd <- select(aa, NUM, CHR) #Select by Name
ee <- select(aa, -c(INT, LGL, FCT)) #Drop Columns
ff <- data.frame('NUM' = aa$NUM, 'CHR' = aa$CHR)
# with() can be used to create an environment using data
gg <- with(aa, data.frame(NUM, CHR))
# [] is used for subsetting but note that 1-column subset is vector by default
hh <- aa[ , c('NUM', 'CHR'), drop = FALSE]
ii <- subset(aa, select = c(NUM, CHR), drop = FALSE) #Avoid
stopifnot(all(sapply(list(dd, ee, ff, gg, hh, ii), identical, bb)))
str(bb)
## 'data.frame': 4 obs. of 2 variables:
## $ NUM: num 1 2 3 4
## $ CHR: chr "a" "b" "c" "d"
Python
pp = df_y.copy()
list(pp.columns)
## ['INT', 'NUM', 'CHR', 'LGL', 'FCT']
qq = pp[['NUM', 'CHR']].copy() #Use List of Names
ss = pp.drop(columns = ['INT', 'LGL', 'FCT']).copy() #Drop Columns
tt = pp.drop(['INT', 'LGL', 'FCT'], axis = 1).copy() #0 Rows, 1 Columns
uu = pp.filter(['NUM', 'CHR']).copy()
assert(qq.equals(ss) and qq.equals(tt) and qq.equals(uu))
qq
## NUM CHR
## 0 1.0 a
## 1 2.0 b
## 2 3.0 c
## 3 4.0 d
- Rename:
- R:
names()
,dplyr::rename()
- Python: rename(), columns
- R:
R
Python
pp = df_y.copy()
list(pp.columns)
## ['INT', 'NUM', 'CHR', 'LGL', 'FCT']
pp.rename(columns = {'INT': 'A', pp.columns[1]: 'B'}, inplace = True) #Old: New
pp.rename(columns = dict(zip(pp.columns[[3]], ['D'])),inplace = True) #Old, New
pp.columns.values[[2, 4]] = ['C', 'E']
list(pp.columns)
## ['A', 'B', 'C', 'D', 'E']
- Sort:
- R:
order()
,dplyr::arrange()
,dplyr::desc()
- Python: index
- R:
R
aa <- df_r
# order() ALWAYS reorders the rownames
bb <- aa[order(aa$CHR, decreasing = TRUE), ]
# arrange() reorders character rownames but reinitialises them from 1 if integer
dd <- dplyr::arrange(aa, dplyr::desc(CHR))
row.names(dd) <- 4:1
stopifnot(identical(bb, dd))
dd
## INT NUM CHR LGL FCT
## 4 4 4 d TRUE Yes
## 3 3 3 c FALSE No
## 2 2 2 b TRUE Yes
## 1 1 1 a FALSE No
4.2 RowNames - Index
- R
row.names()
are called Index in Python and can be set by set_index() - R
data.frame()
hasrow.names()
but tibble heavily discourage their usage - Duplicated row indices are allowed in Python but not in R
R
aa <- data.frame(x = 1:2)
row.names(aa)
## [1] "1" "2"
row.names(aa) <- letters[1:2] #rownames
stopifnot(tibble::has_rownames(aa))
row.names(aa)
## [1] "a" "b"
4.3 Copy
R
george <- fred <- data.frame(x = 11:13) #R Copy-on-Modify
# Before modification both bind to the same address (unlike Python)
stopifnot(identical(obj_addr(fred), obj_addr(george)))
aa <- obj_addr(fred) #Address before modification
fred[2, 'x'] <- NA #Modify
fred$x
## [1] 11 NA 13
stopifnot(obj_addr(fred) != aa) #Bind to a different address (unlike Python)
# No change in non-modified object (george) address or value (same as Python)
stopifnot(obj_addr(george) == aa)
Python
fred = pd.DataFrame({'x': [1, 2, 3]})
george = fred.copy() #Deepcopy by default
percy = copy.deepcopy(fred) #Deepcopy
# Deepcopy bind to different address even before modification (unlike R)
assert(id(fred) != id(george) != id(percy))
pp = id(fred) #Address before modification
fred.at[1, 'x'] = None #Modify
list(fred['x'])
## [1.0, nan, 3.0]
assert(id(fred) == pp) #No change in address (unlike R)
# No change in non-modified objects (george, percy) address or value (same as R)
assert(george.equals(percy) and not george.equals(fred))
4.4 Modify
- R
[]
(base::Extract
) - Python:
- at can only access a single value at a time (faster). It tries to maintain the datatype (fails sometimes). If column number is used by mistake, it creates a New Column (Avoid)
- loc can select multiple rows and/or columns (slower). It does not maintain the datatype and modifies the type silently.
- iat, iloc are indices variants of the above
R
aa <- df_r
aa[1, 2] <- NA
aa[2, 'CHR'] <- NA
aa
## INT NUM CHR LGL FCT
## 1 1 NA a FALSE No
## 2 2 2 <NA> TRUE Yes
## 3 3 3 c FALSE No
## 4 4 4 d TRUE Yes
4.5 Merge, Join
- A
mutating join
allows you to combine variables from two tables. (Advanced R - Hadley)- These are
inner join
,left outer join
,right outer join
,full outer join
,cross join
- These are
- An
inner join
matches pairs of observations whenever theirkeys
are equal. Thus, unmatched rows are not included in the result.- This means that generally inner joins are usually not appropriate for use in analysis because it is too easy to lose observations.
- An
outer join
keeps observations that appear in at least one of the tables.- A
left join
keeps all observations in x. - A
right join
keeps all observations in y. - A
full join
keeps all observations in x and y. - A
cross join
returns the Cartesian product of rows from both tables.
- A
- See Table 4.1 for Joins of R & Python
- R:
merge()
,dplyr::inner_join()
,dplyr::left_join()
,dplyr::right_join()
,dplyr::full_join()
- Python: merge() (SO)
R
aa <- tibble(ID = c(1, 2, 3), A = c('a1', 'a2', 'a3'))
bb <- tibble(ID = c(1, 2, 4), B = c('b1', 'b2', 'b4'))
# Inner Join
ab_j_inner <- dplyr::inner_join(aa, bb, by = 'ID')
ab_m_inner <- merge(aa, bb, by = 'ID')
# Left Outer Join
ab_j_left <- dplyr::left_join(aa, bb, by = 'ID')
ab_m_left <- merge(aa, bb, by = 'ID', all.x = TRUE)
# Right Outer Join
ab_j_right <- dplyr::right_join(aa, bb, by = 'ID')
ab_m_right <- merge(aa, bb, by = 'ID', all.y = TRUE)
# Full Outer Join
ab_j_full <- dplyr::full_join(aa, bb, by = 'ID')
ab_m_full <- merge(aa, bb, by = 'ID', all = TRUE)
# Cross Join
ab_j_cross <- full_join(aa, bb, by = character(), suffix = c('_x', '_y'))
ab_m_cross <- merge(aa, bb, by=NULL, suffixes = c('_x', '_y')) |> arrange(ID_x)
Python
pp = pd.DataFrame({'ID': (1, 2, 3), 'A': ('a1', 'a2', 'a3')})
qq = pd.DataFrame({'ID': (1, 2, 4), 'B': ('b1', 'b2', 'b4')})
pq_inner = pp.merge(qq, on = 'ID', how = 'inner') #Inner Join
pq_left = pp.merge(qq, on = 'ID', how = 'left') #Left Outer Join
pq_right = pp.merge(qq, on = 'ID', how = 'right') #Right Outer Join
pq_full = pp.merge(qq, on = 'ID', how = 'outer') #Full Outer Join
pq_cross = pp.merge(qq, how = 'cross') #Cross Join
|
|
|
|
|
- Filtering joins match observations in the same way as mutating joins, but affect the observations, not the variables.
-
semi_join(x, y)
keeps all observations in x that have a match in y. -
anti_join(x, y)
drops all observations in x that have a match in y.
-
R
ab_j_semi <- semi_join(aa, bb, by = 'ID') # Semi Join
ab_j_anti <- anti_join(aa, bb, by = 'ID') # Anti Join
|
|
4.6 Sets
- These operations work with a complete row, comparing the values of every variable.
- Thus, these expect both tables /df to have the same variables (columns), and treat the observations (rows) like sets.
- Inner Join vs. Intersect
- The
INNER JOIN
will return duplicates, ifid
is duplicated in either table.INTERSECT
removes duplicates. - The
INNER JOIN
will never returnNULL
, butINTERSECT
will returnNULL
.
- The
- Inner Join vs. semi-join
- With a
semi-join
, each record in the first set is returned only once, regardless of how many matches there are in the second set.
- With a
R
|
|
|
|
R
# Two 2 in First & Three 3 in Second
aa <- tibble(ID = c(1, 2, 2, 3, 4),
A = c('a1', 'a21', 'a22', 'a31', 'a4'))
bb <- tibble(ID = c(1, 2, 3, 3, 3, 5),
A = c('a1', 'a21', 'a31', 'a32', 'a33', 'a5'))
#
ii_inn <- inner_join(aa, bb, by = 'ID')
jj_its <- intersect(aa, bb)
kk_sem <- semi_join(aa, bb, by = 'ID')
#
str(ii_inn, vec.len = nrow(ii_inn)) #Duplicate IDs of Both
## tibble [6 × 3] (S3: tbl_df/tbl/data.frame)
## $ ID : num [1:6] 1 2 2 3 3 3
## $ A.x: chr [1:6] "a1" "a21" "a22" "a31" "a31" "a31"
## $ A.y: chr [1:6] "a1" "a21" "a21" "a31" "a32" "a33"
str(jj_its, vec.len = nrow(jj_its)) #No Duplicate IDs of Either
## tibble [3 × 2] (S3: tbl_df/tbl/data.frame)
## $ ID: num [1:3] 1 2 3
## $ A : chr [1:3] "a1" "a21" "a31"
str(kk_sem, vec.len = nrow(kk_sem)) #Duplicates of First found in Second
## tibble [4 × 2] (S3: tbl_df/tbl/data.frame)
## $ ID: num [1:4] 1 2 2 3
## $ A : chr [1:4] "a1" "a21" "a22" "a31"
setdiff(kk_sem, jj_its) #Extra in Semi Join over Intersection
## # A tibble: 1 × 2
## ID A
## <dbl> <chr>
## 1 2 a22
4.7 Missing Values
- R:
NA
represents the missing values. - Python Pandas DataFrame() treats
None
innumeric
columns asnp.nan
. It keepsNone
ornp.nan
inobject
columns as it is, however, internally these all are treated as missing values - Note: reticulate currently converts
object
columns withNone
ornp.nan
tolist
containingNULL
orNaN
. It also convertsnp.nan
innumeric
column toNaN
. All of these should beNA
R
# NULL cannot be concatenated
c(NULL, NULL)
## NULL
list(NULL, NULL)
## [[1]]
## NULL
##
## [[2]]
## NULL
# Only list can contain NULL
aa <- data.frame(x = 1:3, y = I(list(NULL, 'b', 'c')))
aa$x <- list(4, NULL, 6)
str(aa)
## 'data.frame': 3 obs. of 2 variables:
## $ x:List of 3
## ..$ : num 4
## ..$ : NULL
## ..$ : num 6
## $ y:List of 3
## ..$ : NULL
## ..$ : chr "b"
## ..$ : chr "c"
## ..- attr(*, "class")= chr "AsIs"
bb <- aa |> mutate(across(where(is.list), q_NULL_to_NA))
str(bb)
## 'data.frame': 3 obs. of 2 variables:
## $ x: num 4 NA 6
## $ y: chr NA "b" "c"
q_NULL_to_NA <- function(x) {
# Convert DataFrame list columns containing NULL or NaN to vector with NA
# Ex: mutate(aa, across(where(is.list), q_NULL_to_NA))
# Unlike is.na(), is.null() is not vectorised & is.nan() is not for list
x[sapply(x, \(y) is.null(y) || is.nan(y))] <- NA
x <- unlist(x)
return(x)
}
Python
pp = pd.DataFrame({'x': (None, 1, 2), 'y': ('a', None, np.nan)})
pp.info()
## <class 'pandas.core.frame.DataFrame'>
## RangeIndex: 3 entries, 0 to 2
## Data columns (total 2 columns):
## # Column Non-Null Count Dtype
## --- ------ -------------- -----
## 0 x 2 non-null float64
## 1 y 1 non-null object
## dtypes: float64(1), object(1)
## memory usage: 180.0+ bytes
pp
## x y
## 0 NaN a
## 1 1.0 None
## 2 2.0 NaN