In [2]:
import pandas as pd

Load CSV data as a DataFrame

In [11]:
# The sample file is tab-separated, so the separator is passed explicitly.
names_df = pd.read_csv(
    "name.basics_sample_500.tsv",
    sep="\t",
)
# Preview the first 3 rows; names_df.sample(n=3) would show 3 random rows instead.
names_df.head(3)
Out[11]:
nconst primaryName birthYear deathYear primaryProfession knownForTitles
0 nm0000001 Fred Astaire 1899 1987 soundtrack,actor,miscellaneous tt0072308,tt0043044,tt0045537,tt0050419
1 nm0000002 Lauren Bacall 1924 2014 actress,soundtrack tt0117057,tt0040506,tt0037382,tt0038355
2 nm0000003 Brigitte Bardot 1934 \N actress,soundtrack,producer tt0057345,tt0054452,tt0059956,tt0049189

header param

Row number(s) to use as the column names, and the start of the data, or None

In [17]:
# Counter-example: header=1 takes the file's SECOND line as the header, so the
# real column names are dropped and the first record ends up as column labels.
# This is not what we want for this file.
names_df = pd.read_csv(
    "name.basics_sample_500.tsv",
    sep="\t",
    header=1,
)
names_df.head(3)
Out[17]:
nm0000001 Fred Astaire 1899 1987 soundtrack,actor,miscellaneous tt0072308,tt0043044,tt0045537,tt0050419
0 nm0000002 Lauren Bacall 1924 2014 actress,soundtrack tt0117057,tt0040506,tt0037382,tt0038355
1 nm0000003 Brigitte Bardot 1934 \N actress,soundtrack,producer tt0057345,tt0054452,tt0059956,tt0049189
2 nm0000004 John Belushi 1949 1982 actor,writer,soundtrack tt0078723,tt0072562,tt0077975,tt0080455

names param

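List of column names to use. If the file contains no header row, you should explicitly pass header=None. If the file does have a header row (as ours does), pass header=0 so that the given names replace the existing header instead of the header line being read as data.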

In [18]:
# header=0 marks the file's first line as the header so that the custom
# `names` below replace it rather than it being read as a data row.
names_df = pd.read_csv(
    "name.basics_sample_500.tsv",
    sep="\t",
    header=0,
    names=[
        "nID",
        "ArtistName",
        "birth",
        "death",
        "Profession",
        # NOTE(review): probably meant "movies"; kept as-is to match the displayed output.
        "mouvies",
    ],
)
names_df.head(3)
Out[18]:
nID ArtistName birth death Profession mouvies
0 nm0000001 Fred Astaire 1899 1987 soundtrack,actor,miscellaneous tt0072308,tt0043044,tt0045537,tt0050419
1 nm0000002 Lauren Bacall 1924 2014 actress,soundtrack tt0117057,tt0040506,tt0037382,tt0038355
2 nm0000003 Brigitte Bardot 1934 \N actress,soundtrack,producer tt0057345,tt0054452,tt0059956,tt0049189

Loading big files

nrows param

Number of rows of file to read. Useful for reading pieces of large files.

Other useful parameters for large files are chunksize and iterator, which let you read the file piece by piece instead of loading it all at once.

In [21]:
# Read only the first 10 data rows of the file.
names_10_df = pd.read_csv(
    "name.basics_sample_500.tsv",
    sep="\t",
    nrows=10,
)
names_10_df
Out[21]:
nconst primaryName birthYear deathYear primaryProfession knownForTitles
0 nm0000001 Fred Astaire 1899 1987 soundtrack,actor,miscellaneous tt0072308,tt0043044,tt0045537,tt0050419
1 nm0000002 Lauren Bacall 1924 2014 actress,soundtrack tt0117057,tt0040506,tt0037382,tt0038355
2 nm0000003 Brigitte Bardot 1934 \N actress,soundtrack,producer tt0057345,tt0054452,tt0059956,tt0049189
3 nm0000004 John Belushi 1949 1982 actor,writer,soundtrack tt0078723,tt0072562,tt0077975,tt0080455
4 nm0000005 Ingmar Bergman 1918 2007 writer,director,actor tt0083922,tt0050986,tt0060827,tt0050976
5 nm0000006 Ingrid Bergman 1915 1982 actress,soundtrack,producer tt0034583,tt0038109,tt0038787,tt0077711
6 nm0000007 Humphrey Bogart 1899 1957 actor,soundtrack,producer tt0043265,tt0037382,tt0034583,tt0033870
7 nm0000008 Marlon Brando 1924 2004 actor,soundtrack,director tt0047296,tt0078788,tt0070849,tt0068646
8 nm0000009 Richard Burton 1925 1984 actor,producer,soundtrack tt0061184,tt0087803,tt0057877,tt0059749
9 nm0000010 James Cagney 1899 1986 actor,soundtrack,director tt0031867,tt0035575,tt0055256,tt0029870

Clean data

na_values

Additional strings to recognize as NA/NaN. If a dict is passed, it specifies the NA values per column.

In [46]:
# The file marks missing values with the literal text \N; registering it as an
# NA value turns those cells into NaN (deathYear is then shown as float).
names_10_df = pd.read_csv(
    "name.basics_sample_500.tsv",
    sep="\t",
    nrows=10,
    na_values=["\\N"],
)
names_10_df
Out[46]:
nconst primaryName birthYear deathYear primaryProfession knownForTitles
0 nm0000001 Fred Astaire 1899 1987.0 soundtrack,actor,miscellaneous tt0072308,tt0043044,tt0045537,tt0050419
1 nm0000002 Lauren Bacall 1924 2014.0 actress,soundtrack tt0117057,tt0040506,tt0037382,tt0038355
2 nm0000003 Brigitte Bardot 1934 NaN actress,soundtrack,producer tt0057345,tt0054452,tt0059956,tt0049189
3 nm0000004 John Belushi 1949 1982.0 actor,writer,soundtrack tt0078723,tt0072562,tt0077975,tt0080455
4 nm0000005 Ingmar Bergman 1918 2007.0 writer,director,actor tt0083922,tt0050986,tt0060827,tt0050976
5 nm0000006 Ingrid Bergman 1915 1982.0 actress,soundtrack,producer tt0034583,tt0038109,tt0038787,tt0077711
6 nm0000007 Humphrey Bogart 1899 1957.0 actor,soundtrack,producer tt0043265,tt0037382,tt0034583,tt0033870
7 nm0000008 Marlon Brando 1924 2004.0 actor,soundtrack,director tt0047296,tt0078788,tt0070849,tt0068646
8 nm0000009 Richard Burton 1925 1984.0 actor,producer,soundtrack tt0061184,tt0087803,tt0057877,tt0059749
9 nm0000010 James Cagney 1899 1986.0 actor,soundtrack,director tt0031867,tt0035575,tt0055256,tt0029870

Renaming columns

In [49]:
# Note: rename() returns a new DataFrame. The result is only displayed here,
# not assigned, so names_10_df itself keeps its original column names.
names_10_df.rename(
    columns={"primaryName": "ActorName"},
)
Out[49]:
nconst ActorName birthYear deathYear primaryProfession knownForTitles
0 nm0000001 Fred Astaire 1899 1987.0 soundtrack,actor,miscellaneous tt0072308,tt0043044,tt0045537,tt0050419
1 nm0000002 Lauren Bacall 1924 2014.0 actress,soundtrack tt0117057,tt0040506,tt0037382,tt0038355
2 nm0000003 Brigitte Bardot 1934 NaN actress,soundtrack,producer tt0057345,tt0054452,tt0059956,tt0049189
3 nm0000004 John Belushi 1949 1982.0 actor,writer,soundtrack tt0078723,tt0072562,tt0077975,tt0080455
4 nm0000005 Ingmar Bergman 1918 2007.0 writer,director,actor tt0083922,tt0050986,tt0060827,tt0050976
5 nm0000006 Ingrid Bergman 1915 1982.0 actress,soundtrack,producer tt0034583,tt0038109,tt0038787,tt0077711
6 nm0000007 Humphrey Bogart 1899 1957.0 actor,soundtrack,producer tt0043265,tt0037382,tt0034583,tt0033870
7 nm0000008 Marlon Brando 1924 2004.0 actor,soundtrack,director tt0047296,tt0078788,tt0070849,tt0068646
8 nm0000009 Richard Burton 1925 1984.0 actor,producer,soundtrack tt0061184,tt0087803,tt0057877,tt0059749
9 nm0000010 James Cagney 1899 1986.0 actor,soundtrack,director tt0031867,tt0035575,tt0055256,tt0029870