How to delete duplicates from a Pandas DataFrame?
0

How to delete duplicates from a Pandas DataFrame?

This recipe shows how to identify and delete duplicate rows from a Pandas DataFrame.
In [1]:
## How to delete duplicates from a Pandas DataFrame
def Kickstarter_Example_84():
    """Print a walkthrough of duplicate detection and removal in pandas.

    Builds a six-row sample DataFrame in which rows 0 and 1 are identical and
    row 2 differs from them only in ``age``, then prints, in order:

    1. the sample frame,
    2. the boolean mask returned by ``duplicated()``,
    3. the frame with fully duplicated rows dropped (first occurrence kept),
    4. the frame de-duplicated on ``first_name`` alone (last occurrence kept).

    Takes no arguments and returns None; everything is written to stdout.
    As a side effect it installs an "ignore" filter via
    ``warnings.filterwarnings``, which stays in place after the call.
    """

    def _emit(obj):
        # Every section of the report is preceded by one blank line.
        print()
        print(obj)

    # Banner: the title centred in an 82-character field padded with '*'.
    title = 'How to delete duplicates from a Pandas DataFrame'
    _emit('{:*^82}'.format(title))

    # Silence warnings before pandas is imported.
    import warnings
    warnings.filterwarnings("ignore")

    import pandas as pd

    # Sample data: the three 'Jason Miller' rows are the duplicate candidates.
    sample = {
        'first_name': ['Jason', 'Jason', 'Jason', 'Tina', 'Jake', 'Amy'],
        'last_name': ['Miller', 'Miller', 'Miller', 'Ali', 'Milner', 'Cooze'],
        'age': [42, 42, 1111111, 36, 24, 73],
        'preTestScore': [4, 4, 4, 31, 2, 3],
        'postTestScore': [25, 25, 25, 57, 62, 70],
    }
    column_order = ['first_name', 'last_name', 'age',
                    'preTestScore', 'postTestScore']
    df = pd.DataFrame(sample, columns=column_order)
    _emit(df)

    # Mask of rows that repeat an earlier row across all columns.
    duplicate_mask = df.duplicated()
    _emit(duplicate_mask)

    # Remove whole-row duplicates, keeping the first of each group.
    without_exact_repeats = df.drop_duplicates(keep='first')
    _emit(without_exact_repeats)

    # Compare on first_name only and keep the last row of each group.
    last_per_first_name = df.drop_duplicates(subset=['first_name'], keep='last')
    _emit(last_per_first_name)

# Run the recipe; it prints the sample frame, the duplicate mask, and the two de-duplicated frames.
Kickstarter_Example_84()
*****************How to delete duplicates from a Pandas DataFrame*****************

  first_name last_name      age  preTestScore  postTestScore
0      Jason    Miller       42             4             25
1      Jason    Miller       42             4             25
2      Jason    Miller  1111111             4             25
3       Tina       Ali       36            31             57
4       Jake    Milner       24             2             62
5        Amy     Cooze       73             3             70

0    False
1     True
2    False
3    False
4    False
5    False
dtype: bool

  first_name last_name      age  preTestScore  postTestScore
0      Jason    Miller       42             4             25
2      Jason    Miller  1111111             4             25
3       Tina       Ali       36            31             57
4       Jake    Milner       24             2             62
5        Amy     Cooze       73             3             70

  first_name last_name      age  preTestScore  postTestScore
2      Jason    Miller  1111111             4             25
3       Tina       Ali       36            31             57
4       Jake    Milner       24             2             62
5        Amy     Cooze       73             3             70