Link

Pandas in 30 Minutes

References

Table of contents

  1. Index / MultiIndex
  2. Series
  3. DataFrame
  4. Groupby

Index / MultiIndex

## construct

# Flat index built from a NumPy range.
index = pd.Index(np.arange(3))  # Index([0, 1, 2], dtype='int64'); shown as Int64Index before pandas 2.0
# Hierarchical (two-level) index built from (outer, inner) label tuples.
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
                                   ('two', 'a'), ('two', 'b')])

## member

index.name       # name attached to the index, if any
index.is_unique  # a property, not a method: True when no label is repeated

## manipulation

Series

## construct

series = pd.Series([4, 7, -5, 3])  # default index: RangeIndex(start=0, stop=4, step=1)
series = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])  # explicit row labels

## member

series.values   # the underlying data as an array
series.index    # Index(['d', 'b', 'a', 'c'], ...) for the labelled series above

## manipulation

series.sort_index()
series.sort_values()
series.value_counts(normalize=True) # relative frequencies
series.value_counts(ascending=True)

### set & get

series[['c', 'a', 'd']]         # select several labels, returned in the requested order
series[series > 0]              # boolean mask: keep only the entries greater than 0
series1.combine_first(series2)  # fill the missing values of #1 with the values of #2

## mapping

np.exp(series)               # NumPy ufuncs work element-wise on a Series
series.map(lambda x: x * 2)  # call a Python function on every element

## merge

pd.concat([series1, series2])  # stack #2 below #1; replaces Series.append(), removed in pandas 2.0

DataFrame

## construct
# Column-oriented input: every key becomes one column of the frame.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
# `columns` fixes the column order; `index` supplies the row labels.
frame = pd.DataFrame(
    data=data,
    index=["one", "two", "three", "four", "five", "six"],
    columns=['year', 'state', 'pop'],
)

## member

frame.index         # the row labels, as an Index
frame.columns       # the column labels, as an Index

## inspection

frame.head()     # the first rows (5 by default)
frame.tail(2)    # the last 2 rows
frame.describe() # summary statistics of the columns

## manipulation

frame.reindex(['a', 'b', 'c', 'd', 'e'])  # conform the rows to these labels; labels not in the index yield NaN rows
frame.reset_index(drop=False)  # move the index into a regular column (drop=True would discard it)
frame.T             # transpose: swap rows and columns
frame.values        # data as an array; to_numpy() below is the recommended spelling
frame.to_numpy()
frame.sort_index()  # sort rows by their labels
frame.sort_index(axis=1, ascending=False)  # sort the column labels, descending
frame.sort_values(by=['a', 'b'])  # sort rows by column 'a', ties broken by column 'b'
frame.rank()
frame.rank(ascending=False, method='max')  # tied values all receive the highest rank of their group
frame.rank(axis='columns')  # rank the values within each row
# unstack pivots a level of the row index into the column labels: the result
# is a DataFrame having a new level of column labels whose inner-most level
# consists of the pivoted index labels.
frame.unstack(level=-1)

### set & get

# New column aligned on the row labels; rows missing from the Series get NaN.
frame['debt'] = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
# Callable key: evaluated on the frame to build a boolean row mask, then the
# selected rows (those whose 'debt' is missing) are overwritten with 5.
frame[lambda df: df["debt"].isnull()] = 5

frame.loc['three']  # Row in Series (selected by label)
frame.iloc[2]       # Row in Series (selected by position)
frame.at['three', 'pop']       # single scalar; frame.loc['three', 'pop'] is equivalent
frame.loc['three', ['state', 'pop']] # Row x Column (by label)
frame.iloc[2, [3, 0, 1]]             # Row x Column (by position)
frame.iloc[[1, 2], [3, 0, 1]]        # Rows x Columns (by position)

# NOTE: the filters below assume a frame that has a column named 'three'.
frame[frame['three'] > 5]           # rows where column 'three' exceeds 5
frame[lambda df: df['three'] > 5]   # the same filter written as a callable
frame[frame < 5] = 0                # element-wise mask: every cell below 5 becomes 0
frame['two'] = 0                    # set (or create) column 'two'
frame.iloc[:, :3][frame.three > 5]  # first 3 columns of the rows where 'three' > 5

### delete

del frame['eastern']    # delete column in place (mutates frame)
frame.drop(['two', 'four'], axis='columns')  # new frame without these columns
frame.drop('a')           # new frame without row 'a'
frame.drop(['d', 'c'])    # new frame without rows 'd' and 'c'

## mapping

# Element-wise mapping: same result as apply() with an element-wise function,
# but a different approach.
# NOTE(review): applymap() is deprecated since pandas 2.1 in favour of
# DataFrame.map() — confirm the pandas version this sheet targets.
frame.applymap(lambda x: x * 2)

## aggregation on axis

frame.apply(lambda x: x.max() - x.min(), axis='columns')  # x is one row (a Series); one result per row
frame.sum()  # one total per column
frame.mean(axis='columns', skipna=False)  # one mean per row; skipna=False keeps NaN when a value is missing
def f(x):
    """Return the smallest and largest value of ``x`` as a two-entry Series.

    The result is labelled ``'min'`` and ``'max'``, in that order.
    """
    extremes = {'min': x.min(), 'max': x.max()}
    return pd.Series(extremes)
frame.apply(f)  # f is called once per column and receives that column as a Series
frame["debt"].apply(lambda x: x * 2) # Series.apply: x is a single element of the column

Groupby

groupby = group + mapping + concatenation

grouped = df.groupby(?):

  • a list of keys indicating how to group. Each element can be
    • an array whose length equals the length of the index
    • a Series (or dict) that maps the index labels to new group labels
    • a function that will be applied to each index label
    • a string s, which is shorthand for the column df[s]
  • axis 0 or 1 (rows or columns)

operation grouped.?:

  • mean, sum, prod, median, std, var
  • count (not NaN), size
  • first (not NaN), last
## iterability

# Iterating a GroupBy yields (group key, sub-frame) pairs.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
# With several grouping keys the group key is a tuple, which can be unpacked.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print((k1, k2))
    print(group)
dict(list(df.groupby('key1')))  # {group key: sub-frame}

## operation

### agg

# Custom aggregation: the spread (max - min) of each group.
peak_to_peak = lambda a: a.max() - a.min()
# A list mixes built-in aggregations (by name) with custom callables.
grouped.agg(['mean', 'std', peak_to_peak])
# Select several columns with a list (double brackets); indexing a GroupBy
# with a bare tuple of column names was removed in pandas 2.0.
grouped[['tip_pct', 'total_bill']].agg(['count', 'mean', 'max'])

### apply

def top(df, n=5, column='tip_pct'):
    """Return the last ``n`` rows of ``df`` after sorting by ``column``.

    With the default ascending sort these are the rows holding the largest
    values of ``column``, returned in ascending order.

    Args:
        df: DataFrame to select from.
        n: Number of rows to keep. ``0`` yields an empty frame; a value
            larger than ``len(df)`` yields every row.
        column: Label of the column to sort by.

    Returns:
        A DataFrame with at most ``n`` rows.
    """
    # tail(n) matches [-n:] for every n except 0, where [-0:] would
    # return the whole frame instead of no rows.
    return df.sort_values(by=column).tail(n)
    
f = lambda x: x.describe()  # summary statistics of whatever object it is given
grouped.apply(f)  # call f on every group and combine the results