How to preprocess string data within a Pandas DataFrame?
0

How to preprocess string data within a Pandas DataFrame?

This recipe helps you preprocess string data within a Pandas DataFrame.
In [1]:
## How to preprocess string data within a Pandas DataFrame
def Kickstarter_Example_74():
    """Demonstrate regex-based preprocessing of a string column in pandas.

    Builds a one-column DataFrame of raw strings, then uses the ``.str``
    accessor to test each row for a date-shaped substring and to extract a
    single digit, a date, a score and a capitalised word into new columns.
    Every intermediate result and the final DataFrame are printed to stdout.

    The extracted values are kept as strings; no numeric or datetime
    conversion is performed.

    Returns:
        None. All results are written to stdout.
    """
    import warnings

    # load libraries
    import pandas as pd

    title = 'How to preprocess string data within a Pandas DataFrame'
    print()
    print(format(title, '*^82'))

    # NOTE: this installs a process-wide "ignore" filter, so warnings stay
    # silenced after the function returns as well.
    warnings.filterwarnings("ignore")

    # Regex patterns are raw strings so that backslash classes such as \d and
    # \w reach the regex engine untouched (non-raw '\d' is an invalid string
    # escape and warns on recent Python versions).
    date_pattern = r'\d{4}-\d{2}-\d{2}'   # digits only, shaped like xxxx-xx-xx
    digit_pattern = r'(\d)'               # first single digit in the string
    score_pattern = r'(\d{4}\.\d)'        # four digits, a dot, one digit
    word_pattern = r'([A-Z]\w*)'          # first word starting with a capital

    # Create a dataframe with a single column of strings
    data = {'stringData': ['Arizona 1 2014-12-23    3242.0',
                           'Iowa 1 2010-02-23       3453.7',
                           'Oregon 0 2014-06-20     2123.0',
                           'Maryland 0 2014-03-14   1123.6',
                           'Florida 1 2013-01-15    2134.0',
                           'Georgia 0 2012-07-14    2345.6']}
    df = pd.DataFrame(data, columns=['stringData'])
    print()
    print(df)

    raw = df['stringData']

    # Search a column of strings for a pattern
    # Which rows of df['stringData'] contain a date shaped like 'xxxx-xx-xx'?
    print()
    print(raw.str.contains(date_pattern, regex=True))

    # Extract the column of single digits
    # expand=False yields a Series (one capture group), which is what a
    # single-column assignment expects.
    df['Boolean'] = raw.str.extract(digit_pattern, expand=False)
    print()
    print(df['Boolean'])

    # Extract the column of dates
    # In the column 'stringData', extract xxxx-xx-xx in the strings
    df['date'] = raw.str.extract('(' + date_pattern + ')', expand=False)
    print()
    print(df['date'])

    # Extract the column of thousands
    # In the column 'stringData', extract ####.# in the strings
    df['score'] = raw.str.extract(score_pattern, expand=False)
    print()
    print(df['score'])

    # Extract the column of words
    # In the column 'stringData', extract the capitalised word in the strings
    df['state'] = raw.str.extract(word_pattern, expand=False)
    print()
    print(df['state'])

    # View the final dataframe
    print()
    print(df)
Kickstarter_Example_74()
*************How to preprocess string data within a Pandas DataFrame**************

                       stringData
0  Arizona 1 2014-12-23    3242.0
1  Iowa 1 2010-02-23       3453.7
2  Oregon 0 2014-06-20     2123.0
3  Maryland 0 2014-03-14   1123.6
4  Florida 1 2013-01-15    2134.0
5  Georgia 0 2012-07-14    2345.6

0    True
1    True
2    True
3    True
4    True
5    True
Name: stringData, dtype: bool

0    1
1    1
2    0
3    0
4    1
5    0
Name: Boolean, dtype: object

0    2014-12-23
1    2010-02-23
2    2014-06-20
3    2014-03-14
4    2013-01-15
5    2012-07-14
Name: date, dtype: object

0    3242.0
1    3453.7
2    2123.0
3    1123.6
4    2134.0
5    2345.6
Name: score, dtype: object

0     Arizona
1        Iowa
2      Oregon
3    Maryland
4     Florida
5     Georgia
Name: state, dtype: object

                       stringData Boolean        date   score     state
0  Arizona 1 2014-12-23    3242.0       1  2014-12-23  3242.0   Arizona
1  Iowa 1 2010-02-23       3453.7       1  2010-02-23  3453.7      Iowa
2  Oregon 0 2014-06-20     2123.0       0  2014-06-20  2123.0    Oregon
3  Maryland 0 2014-03-14   1123.6       0  2014-03-14  1123.6  Maryland
4  Florida 1 2013-01-15    2134.0       1  2013-01-15  2134.0   Florida
5  Georgia 0 2012-07-14    2345.6       0  2012-07-14  2345.6   Georgia