DataFrames Join

In [1]:
import pandas as pd 

Load data - not good for join!!!

In [2]:
devs = pd.read_csv("datasets/developers/developers.csv", sep=";")
devs
Out[2]:
did dname
0 1 Ivan
1 2 Asen
2 3 Maria
3 4 Stoyan
4 5 Aleks
5 6 Svetlin
In [3]:
langs = pd.read_csv("datasets/developers/languages.csv", sep=";")
langs
Out[3]:
did language
0 2 "C++"
1 3 "Python"
2 3 "R"
3 6 "Java"

Load data - good for join!!!

In [4]:
bg_towns = pd.DataFrame(["sofia", "sandanski", "pleven","varna"], columns=["town"])
bg_towns
Out[4]:
town
0 sofia
1 sandanski
2 pleven
3 varna
In [5]:
bg_weather = pd.DataFrame([25, 35, 20 ], columns=["temp"])
bg_weather
Out[5]:
temp
0 25
1 35
2 20

Inner Join with Join()

In [6]:
# on="did" => matches devs.did == langs.index 
# how="inner" => removes NaN value

dev_langs_inner = devs.join(langs,lsuffix="_l",rsuffix="_r", on="did", how="inner")
dev_langs_inner
Out[6]:
did did_l dname did_r language
0 1 1 Ivan 3 "Python"
1 2 2 Asen 3 "R"
2 3 3 Maria 6 "Java"
In [7]:
bg_weather_inner = bg_weather.join(bg_towns, how="inner")
bg_weather_inner
Out[7]:
temp town
0 25 sofia
1 35 sandanski
2 20 pleven

Outer Join with Join()

In [8]:
dev_langs_outer = devs.join(langs,lsuffix="_l",rsuffix="_r", on="did", how="outer")
dev_langs_outer
Out[8]:
did did_l dname did_r language
0 1 1.0 Ivan 3.0 "Python"
1 2 2.0 Asen 3.0 "R"
2 3 3.0 Maria 6.0 "Java"
3 4 4.0 Stoyan NaN NaN
4 5 5.0 Aleks NaN NaN
5 6 6.0 Svetlin NaN NaN
5 0 NaN NaN 2.0 "C++"
In [9]:
bg_weather_outer = bg_weather.join(bg_towns, how="outer")
bg_weather_outer
Out[9]:
temp town
0 25.0 sofia
1 35.0 sandanski
2 20.0 pleven
3 NaN varna