{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Load csv data as DataFrame" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nconstprimaryNamebirthYeardeathYearprimaryProfessionknownForTitles
0nm0000001Fred Astaire18991987soundtrack,actor,miscellaneoustt0072308,tt0043044,tt0045537,tt0050419
1nm0000002Lauren Bacall19242014actress,soundtracktt0117057,tt0040506,tt0037382,tt0038355
2nm0000003Brigitte Bardot1934\\Nactress,soundtrack,producertt0057345,tt0054452,tt0059956,tt0049189
\n", "
" ], "text/plain": [ " nconst primaryName birthYear deathYear \\\n", "0 nm0000001 Fred Astaire 1899 1987 \n", "1 nm0000002 Lauren Bacall 1924 2014 \n", "2 nm0000003 Brigitte Bardot 1934 \\N \n", "\n", " primaryProfession knownForTitles \n", "0 soundtrack,actor,miscellaneous tt0072308,tt0043044,tt0045537,tt0050419 \n", "1 actress,soundtrack tt0117057,tt0040506,tt0037382,tt0038355 \n", "2 actress,soundtrack,producer tt0057345,tt0054452,tt0059956,tt0049189 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names_df = pd.read_csv(\"name.basics_sample_500.tsv\", sep=\"\\t\")\n", "names_df.head(n=3)\n", "# names_df.sample(n=3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### header param" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Row number(s) (0-indexed) to use as the column names; the data starts on the row after the header. Pass `header=None` if the file has no header row." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nm0000001Fred Astaire18991987soundtrack,actor,miscellaneoustt0072308,tt0043044,tt0045537,tt0050419
0nm0000002Lauren Bacall19242014actress,soundtracktt0117057,tt0040506,tt0037382,tt0038355
1nm0000003Brigitte Bardot1934\\Nactress,soundtrack,producertt0057345,tt0054452,tt0059956,tt0049189
2nm0000004John Belushi19491982actor,writer,soundtracktt0078723,tt0072562,tt0077975,tt0080455
\n", "
" ], "text/plain": [ " nm0000001 Fred Astaire 1899 1987 soundtrack,actor,miscellaneous \\\n", "0 nm0000002 Lauren Bacall 1924 2014 actress,soundtrack \n", "1 nm0000003 Brigitte Bardot 1934 \\N actress,soundtrack,producer \n", "2 nm0000004 John Belushi 1949 1982 actor,writer,soundtrack \n", "\n", " tt0072308,tt0043044,tt0045537,tt0050419 \n", "0 tt0117057,tt0040506,tt0037382,tt0038355 \n", "1 tt0057345,tt0054452,tt0059956,tt0049189 \n", "2 tt0078723,tt0072562,tt0077975,tt0080455 " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# header=1 is NOT what we want for this file: row 1 (the first data row)\n", "# becomes the column names and the real header row is dropped:\n", "names_df = pd.read_csv(\"name.basics_sample_500.tsv\", sep=\"\\t\", header=1)\n", "names_df.head(n=3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### names param" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "List of column names to use. If the file contains no header row, you should explicitly pass `header=None`. If the file does have a header row, pass `header=0` so that the given names replace it instead of being added on top of it." ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nIDArtistNamebirthdeathProfessionmouvies
0nm0000001Fred Astaire18991987soundtrack,actor,miscellaneoustt0072308,tt0043044,tt0045537,tt0050419
1nm0000002Lauren Bacall19242014actress,soundtracktt0117057,tt0040506,tt0037382,tt0038355
2nm0000003Brigitte Bardot1934\\Nactress,soundtrack,producertt0057345,tt0054452,tt0059956,tt0049189
\n", "
" ], "text/plain": [ " nID ArtistName birth death Profession \\\n", "0 nm0000001 Fred Astaire 1899 1987 soundtrack,actor,miscellaneous \n", "1 nm0000002 Lauren Bacall 1924 2014 actress,soundtrack \n", "2 nm0000003 Brigitte Bardot 1934 \\N actress,soundtrack,producer \n", "\n", " mouvies \n", "0 tt0072308,tt0043044,tt0045537,tt0050419 \n", "1 tt0117057,tt0040506,tt0037382,tt0038355 \n", "2 tt0057345,tt0054452,tt0059956,tt0049189 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names_df = pd.read_csv(\"name.basics_sample_500.tsv\", sep=\"\\t\",\n", " header=0,\n", " names=[\"nID\", \"ArtistName\", \"birth\", \"death\", \"Profession\", \"mouvies\"])\n", "names_df.head(n=3)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading big files" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### nrows param" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Number of rows of the file to read. Useful for reading pieces of large files.\n", "\n", "Other useful parameters are `chunksize` and `iterator`, which let you read the file in chunks instead of loading it all at once." ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nconstprimaryNamebirthYeardeathYearprimaryProfessionknownForTitles
0nm0000001Fred Astaire18991987soundtrack,actor,miscellaneoustt0072308,tt0043044,tt0045537,tt0050419
1nm0000002Lauren Bacall19242014actress,soundtracktt0117057,tt0040506,tt0037382,tt0038355
2nm0000003Brigitte Bardot1934\\Nactress,soundtrack,producertt0057345,tt0054452,tt0059956,tt0049189
3nm0000004John Belushi19491982actor,writer,soundtracktt0078723,tt0072562,tt0077975,tt0080455
4nm0000005Ingmar Bergman19182007writer,director,actortt0083922,tt0050986,tt0060827,tt0050976
5nm0000006Ingrid Bergman19151982actress,soundtrack,producertt0034583,tt0038109,tt0038787,tt0077711
6nm0000007Humphrey Bogart18991957actor,soundtrack,producertt0043265,tt0037382,tt0034583,tt0033870
7nm0000008Marlon Brando19242004actor,soundtrack,directortt0047296,tt0078788,tt0070849,tt0068646
8nm0000009Richard Burton19251984actor,producer,soundtracktt0061184,tt0087803,tt0057877,tt0059749
9nm0000010James Cagney18991986actor,soundtrack,directortt0031867,tt0035575,tt0055256,tt0029870
\n", "
" ], "text/plain": [ " nconst primaryName birthYear deathYear \\\n", "0 nm0000001 Fred Astaire 1899 1987 \n", "1 nm0000002 Lauren Bacall 1924 2014 \n", "2 nm0000003 Brigitte Bardot 1934 \\N \n", "3 nm0000004 John Belushi 1949 1982 \n", "4 nm0000005 Ingmar Bergman 1918 2007 \n", "5 nm0000006 Ingrid Bergman 1915 1982 \n", "6 nm0000007 Humphrey Bogart 1899 1957 \n", "7 nm0000008 Marlon Brando 1924 2004 \n", "8 nm0000009 Richard Burton 1925 1984 \n", "9 nm0000010 James Cagney 1899 1986 \n", "\n", " primaryProfession knownForTitles \n", "0 soundtrack,actor,miscellaneous tt0072308,tt0043044,tt0045537,tt0050419 \n", "1 actress,soundtrack tt0117057,tt0040506,tt0037382,tt0038355 \n", "2 actress,soundtrack,producer tt0057345,tt0054452,tt0059956,tt0049189 \n", "3 actor,writer,soundtrack tt0078723,tt0072562,tt0077975,tt0080455 \n", "4 writer,director,actor tt0083922,tt0050986,tt0060827,tt0050976 \n", "5 actress,soundtrack,producer tt0034583,tt0038109,tt0038787,tt0077711 \n", "6 actor,soundtrack,producer tt0043265,tt0037382,tt0034583,tt0033870 \n", "7 actor,soundtrack,director tt0047296,tt0078788,tt0070849,tt0068646 \n", "8 actor,producer,soundtrack tt0061184,tt0087803,tt0057877,tt0059749 \n", "9 actor,soundtrack,director tt0031867,tt0035575,tt0055256,tt0029870 " ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names_10_df = pd.read_csv(\"name.basics_sample_500.tsv\", sep=\"\\t\", nrows=10)\n", "names_10_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Clean data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### na_values" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Additional strings to recognize as NA/NaN. If a dict is passed, it specifies per-column NA values.\n", "\n", "In this dataset missing values are written as `\\N` (see Brigitte Bardot's `deathYear` above), so we pass that string to `na_values`." ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nconstprimaryNamebirthYeardeathYearprimaryProfessionknownForTitles
0nm0000001Fred Astaire18991987.0soundtrack,actor,miscellaneoustt0072308,tt0043044,tt0045537,tt0050419
1nm0000002Lauren Bacall19242014.0actress,soundtracktt0117057,tt0040506,tt0037382,tt0038355
2nm0000003Brigitte Bardot1934NaNactress,soundtrack,producertt0057345,tt0054452,tt0059956,tt0049189
3nm0000004John Belushi19491982.0actor,writer,soundtracktt0078723,tt0072562,tt0077975,tt0080455
4nm0000005Ingmar Bergman19182007.0writer,director,actortt0083922,tt0050986,tt0060827,tt0050976
5nm0000006Ingrid Bergman19151982.0actress,soundtrack,producertt0034583,tt0038109,tt0038787,tt0077711
6nm0000007Humphrey Bogart18991957.0actor,soundtrack,producertt0043265,tt0037382,tt0034583,tt0033870
7nm0000008Marlon Brando19242004.0actor,soundtrack,directortt0047296,tt0078788,tt0070849,tt0068646
8nm0000009Richard Burton19251984.0actor,producer,soundtracktt0061184,tt0087803,tt0057877,tt0059749
9nm0000010James Cagney18991986.0actor,soundtrack,directortt0031867,tt0035575,tt0055256,tt0029870
\n", "
" ], "text/plain": [ " nconst primaryName birthYear deathYear \\\n", "0 nm0000001 Fred Astaire 1899 1987.0 \n", "1 nm0000002 Lauren Bacall 1924 2014.0 \n", "2 nm0000003 Brigitte Bardot 1934 NaN \n", "3 nm0000004 John Belushi 1949 1982.0 \n", "4 nm0000005 Ingmar Bergman 1918 2007.0 \n", "5 nm0000006 Ingrid Bergman 1915 1982.0 \n", "6 nm0000007 Humphrey Bogart 1899 1957.0 \n", "7 nm0000008 Marlon Brando 1924 2004.0 \n", "8 nm0000009 Richard Burton 1925 1984.0 \n", "9 nm0000010 James Cagney 1899 1986.0 \n", "\n", " primaryProfession knownForTitles \n", "0 soundtrack,actor,miscellaneous tt0072308,tt0043044,tt0045537,tt0050419 \n", "1 actress,soundtrack tt0117057,tt0040506,tt0037382,tt0038355 \n", "2 actress,soundtrack,producer tt0057345,tt0054452,tt0059956,tt0049189 \n", "3 actor,writer,soundtrack tt0078723,tt0072562,tt0077975,tt0080455 \n", "4 writer,director,actor tt0083922,tt0050986,tt0060827,tt0050976 \n", "5 actress,soundtrack,producer tt0034583,tt0038109,tt0038787,tt0077711 \n", "6 actor,soundtrack,producer tt0043265,tt0037382,tt0034583,tt0033870 \n", "7 actor,soundtrack,director tt0047296,tt0078788,tt0070849,tt0068646 \n", "8 actor,producer,soundtrack tt0061184,tt0087803,tt0057877,tt0059749 \n", "9 actor,soundtrack,director tt0031867,tt0035575,tt0055256,tt0029870 " ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names_10_df = pd.read_csv(\"name.basics_sample_500.tsv\", sep=\"\\t\", nrows=10, na_values=[\"\\\\N\"])\n", "names_10_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Renaming columns" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
nconstActorNamebirthYeardeathYearprimaryProfessionknownForTitles
0nm0000001Fred Astaire18991987.0soundtrack,actor,miscellaneoustt0072308,tt0043044,tt0045537,tt0050419
1nm0000002Lauren Bacall19242014.0actress,soundtracktt0117057,tt0040506,tt0037382,tt0038355
2nm0000003Brigitte Bardot1934NaNactress,soundtrack,producertt0057345,tt0054452,tt0059956,tt0049189
3nm0000004John Belushi19491982.0actor,writer,soundtracktt0078723,tt0072562,tt0077975,tt0080455
4nm0000005Ingmar Bergman19182007.0writer,director,actortt0083922,tt0050986,tt0060827,tt0050976
5nm0000006Ingrid Bergman19151982.0actress,soundtrack,producertt0034583,tt0038109,tt0038787,tt0077711
6nm0000007Humphrey Bogart18991957.0actor,soundtrack,producertt0043265,tt0037382,tt0034583,tt0033870
7nm0000008Marlon Brando19242004.0actor,soundtrack,directortt0047296,tt0078788,tt0070849,tt0068646
8nm0000009Richard Burton19251984.0actor,producer,soundtracktt0061184,tt0087803,tt0057877,tt0059749
9nm0000010James Cagney18991986.0actor,soundtrack,directortt0031867,tt0035575,tt0055256,tt0029870
\n", "
" ], "text/plain": [ " nconst ActorName birthYear deathYear \\\n", "0 nm0000001 Fred Astaire 1899 1987.0 \n", "1 nm0000002 Lauren Bacall 1924 2014.0 \n", "2 nm0000003 Brigitte Bardot 1934 NaN \n", "3 nm0000004 John Belushi 1949 1982.0 \n", "4 nm0000005 Ingmar Bergman 1918 2007.0 \n", "5 nm0000006 Ingrid Bergman 1915 1982.0 \n", "6 nm0000007 Humphrey Bogart 1899 1957.0 \n", "7 nm0000008 Marlon Brando 1924 2004.0 \n", "8 nm0000009 Richard Burton 1925 1984.0 \n", "9 nm0000010 James Cagney 1899 1986.0 \n", "\n", " primaryProfession knownForTitles \n", "0 soundtrack,actor,miscellaneous tt0072308,tt0043044,tt0045537,tt0050419 \n", "1 actress,soundtrack tt0117057,tt0040506,tt0037382,tt0038355 \n", "2 actress,soundtrack,producer tt0057345,tt0054452,tt0059956,tt0049189 \n", "3 actor,writer,soundtrack tt0078723,tt0072562,tt0077975,tt0080455 \n", "4 writer,director,actor tt0083922,tt0050986,tt0060827,tt0050976 \n", "5 actress,soundtrack,producer tt0034583,tt0038109,tt0038787,tt0077711 \n", "6 actor,soundtrack,producer tt0043265,tt0037382,tt0034583,tt0033870 \n", "7 actor,soundtrack,director tt0047296,tt0078788,tt0070849,tt0068646 \n", "8 actor,producer,soundtrack tt0061184,tt0087803,tt0057877,tt0059749 \n", "9 actor,soundtrack,director tt0031867,tt0035575,tt0055256,tt0029870 " ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# note that rename returns a new DataFrame; names_10_df itself is left unchanged\n", "names_10_df.rename(columns={\"primaryName\": \"ActorName\"})" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.4" } }, "nbformat": 4, "nbformat_minor": 2 }