import numpy as np
a = np.array([1, 2, 3, 4], dtype=float)
a

array([1., 2., 3., 4.])


a = range(1000)
%timeit [ i**2 for i in a]

280 µs ± 8.64 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


b = np.arange(1000)
%timeit b**2

1.02 µs ± 45.7 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


c = np.array([[1,2],[3,4]])
c

array([[1, 2],
       [3, 4]])


c.ndim

2


c.shape

(2, 2)


c = np.arange(27)
c.reshape((3,3,3))

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])


np.zeros((2,2))

array([[0., 0.],
       [0., 0.]])


np.ones((2,1))

array([[1.],
       [1.]])


a = np.arange(27).reshape((3,3,3))
np.ones_like(a)

array([[[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]],

       [[1, 1, 1],
        [1, 1, 1],
        [1, 1, 1]]])


np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])


a = np.arange(10)
a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


a[0]

0


a[-1]

9


a[0:3]

array([0, 1, 2])


a[::2]

array([0, 2, 4, 6, 8])


a = a.reshape(5,2)
a

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])


a[3,1]

7


a[2,:]

array([4, 5])


a = np.arange(9)


b = a.reshape((3,3))


np.shares_memory(a,b)

True


a[3] = -1

b

array([[ 0,  1,  2],
       [-1,  4,  5],
       [ 6,  7,  8]])


np.shares_memory(a,b)

True


b = a.copy()
np.shares_memory(a,b)

False


ene = np.random.exponential(size=10, scale=10.) # 1/scale e^(-ene/scale)
ene

array([26.78949043, 39.75183681,  0.36677236,  3.8809366 ,  0.54143778,
        3.64373788, 44.26306009,  0.63227985, 16.09218254,  5.13612747])


mask = ene > 2
mask

array([ True,  True, False,  True, False,  True,  True, False,  True,
        True])


ene[mask]

array([26.78949043, 39.75183681,  3.8809366 ,  3.64373788, 44.26306009,
       16.09218254,  5.13612747])


ene[ene<2]

array([0.36677236, 0.54143778, 0.63227985])


ene[ene<2] = 0
ene

array([26.78949043, 39.75183681,  0.        ,  3.8809366 ,  0.        ,
        3.64373788, 44.26306009,  0.        , 16.09218254,  5.13612747])


status = np.random.randint(low=0,high=10,size=10)
status

array([3, 1, 3, 2, 1, 2, 8, 5, 7, 4])


status[[0, 3, 5]]

array([3, 2, 2])


status[[0, 3, 5]] = -1
status

array([-1,  1,  3, -1,  1, -1,  8,  5,  7,  4])


a = np.arange(4)
a

array([0, 1, 2, 3])

a+1

array([1, 2, 3, 4])


10**a

array([   1,   10,  100, 1000])


np.sin(a)

array([0.        , 0.84147098, 0.90929743, 0.14112001])


a = np.random.randint(low=0,high=10,size=4)
a

array([2, 8, 7, 7])


np.sum(a)

24


np.max(a), np.min(a)

(8, 2)


np.argmax(a), np.argmin(a)

(1, 0)


np.mean(a), np.median(a), np.std(a)

(6.0, 7.0, 2.345207879911715)


a = a.reshape(2,2)
a

array([[2, 8],
       [7, 7]])


np.sum(a,axis=1)

array([10, 14])


m1 = a>3
m1

array([[False,  True],
       [ True,  True]])


np.all(m1)

False


np.any(m1)

True


%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
X = np.linspace(-np.pi, np.pi, 256, endpoint=True)
C, S = np.cos(X), np.sin(X)
plt.plot(X,C)
plt.plot(X,S)

[<matplotlib.lines.Line2D at 0x7fbd7c35f810>]


# Customised cos/sin plot. The figure is fully decorated first and written to
# disk last: savefig captures the figure as it is at call time, so saving
# earlier would produce an image without grid, labels, title and legend.
plt.figure(figsize=(4, 3), dpi=80)
plt.plot(X, C, color="blue", linewidth=1.0, linestyle="-", label="cos")
plt.plot(X, S, color="green", linewidth=1.0, linestyle="-", label="sin")
plt.xlim(-4.0, 4.0)
plt.xticks(np.linspace(-4, 4, 9, endpoint=True))
plt.grid()
plt.xlabel("x")
plt.ylabel("y")
plt.title("Example")
plt.legend(loc="best")
plt.savefig("example.png", dpi=72)

<matplotlib.legend.Legend at 0x7fbd7c2d1850>


# 2x2 grid of panels filled in subplot order 1..4: cosine and sine alternate,
# each drawn with its own line colour.
plt.figure(figsize=(6, 4))
panels = [
    (C, "blue", "cos"),
    (S, "green", "sin"),
    (C, "red", "cos"),
    (S, "black", "sin"),
]
for slot, (curve, shade, tag) in enumerate(panels, start=1):
    plt.subplot(2, 2, slot)
    plt.plot(X, curve, color=shade, linewidth=1.0, linestyle="-", label=tag)
plt.show()


plt.rcdefaults()
fig, ax = plt.subplots(figsize=(4,3))
# Example data
people = ('Tom', 'Dick', 'Harry', 'Slim', 'Jim')
y_pos = np.arange(len(people))
performance = 3 + 10 * np.random.rand(len(people))
error = np.random.rand(len(people))
ax.barh(y_pos, performance, xerr=error, align='center',
        color='green', ecolor='black')
ax.set_yticks(y_pos)
ax.set_yticklabels(people)
ax.invert_yaxis()  # labels read top-to-bottom
ax.set_xlabel('Performance')
ax.set_title('How fast do you want to go today?')

plt.show()


x = np.linspace(0, 1, 500)
y = np.sin(4 * np.pi * x) * np.exp(-5 * x)
fig, ax = plt.subplots()
ax.fill(x, y, zorder=10)
ax.grid(True, zorder=5)
plt.show()


fig, ax = plt.subplots()
for color in ['red', 'green', 'blue']:
    n = 750
    x, y = np.random.rand(2, n)
    scale = 200.0 * np.random.rand(n)
    ax.scatter(x, y, c=color, s=scale, label=color,
               alpha=0.3, edgecolors='none')

ax.legend()
ax.grid(True)

plt.show()


# Two histograms of the same normally distributed sample: equal-width bins on
# the left, hand-picked unequal bin edges on the right.
mu = 200
sigma = 25
x = np.random.normal(mu, sigma, size=100)
fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(6, 3))
ax0.hist(x, 20, density=True, histtype='stepfilled', facecolor='g', alpha=0.75)
ax0.set_title('stepfilled')
# Create a histogram by providing the bin edges (unequally spaced).
bins = [100, 150, 180, 195, 205, 220, 250, 300]
ax1.hist(x, bins, density=True, histtype='bar', rwidth=0.8)
ax1.set_title('unequal bins')
# Use a figure-level title so the per-axes titles are not overwritten, and
# build it from mu/sigma so it always matches the parameters actually sampled.
fig.subplots_adjust(top=0.8)  # leave room above the axes titles
fig.suptitle(rf'Histogram of IQ: $\mu={mu}$, $\sigma={sigma}$');


from matplotlib import colors, ticker, cm
from scipy.stats import multivariate_normal

# Evaluate a two-component Gaussian mixture on a regular grid and draw it as
# filled contours with logarithmically spaced levels.
N = 100
x = np.linspace(-3.0, 3.0, N)
y = np.linspace(-2.0, 2.0, N)
X, Y = np.meshgrid(x, y)

# Stack the grid coordinates along a trailing axis of length 2, one (x, y)
# pair per grid point, which is the input passed to pdf().
pos = np.empty(X.shape + (2,))
pos[..., 0] = X
pos[..., 1] = Y

# A low, broad hump centred at (0.1, 0.2) ...
hump = multivariate_normal([0.1, 0.2], [[1.0, 0.], [0, 1.0]])
# ... with a narrow spike coming out of the top right at (1, 1).
spike = multivariate_normal([1.0, 1.0], [[0.01, 0.], [0., 0.01]])
z = hump.pdf(pos) + 0.1 * spike.pdf(pos)

# The colour axis needs a log scale to show both hump and spike (a linear
# scale only shows the spike). Level selection stays automatic; passing a
# LogLocator tells contourf to use log-spaced levels.
fig, ax = plt.subplots(figsize=(4, 3))
cs = ax.contourf(X, Y, z, locator=ticker.LogLocator(), cmap=cm.PuBu_r)
cbar = fig.colorbar(cs)


import numpy as np
import pandas as pd

s = pd.Series( [1., 2., 3., np.nan, 5. ], index=["a","b","c","d","e"])
s

a    1.0
b    2.0
c    3.0
d    NaN
e    5.0
dtype: float64


df = pd.DataFrame( 
    {
        'Col1': [1.,2.,3.,4.],
        'Col2': ["a","b","c","d"],
        'Col3': [True, False, True, True]
    }
)
df


df.dtypes

Col1    float64
Col2     object
Col3       bool
dtype: object


df.columns

Index(['Col1', 'Col2', 'Col3'], dtype='object')


df.index

RangeIndex(start=0, stop=4, step=1)


# 100-row demo frame: random digits in A, powers of two in B, a constant in C.
# B is computed in float64: raising 2 to a NumPy int64 exponent wraps around
# silently once the result no longer fits in 64 bits, which would fill the
# column with negative numbers and zeros from 2**63 onwards.
df = pd.DataFrame({
    'A': np.random.randint(0, 10, 100),
    'B': 2.0 ** np.arange(100),
    'C': "a",
})
df.head()


df.tail(2)


df.describe()


dates = pd.date_range('20190527',periods=7)
df = pd.DataFrame( np.random.rand(7,4), index=dates, columns=['A','B','C','D'])
df


df['A'] # or df.A

2019-05-27    0.272727
2019-05-28    0.063530
2019-05-29    0.635973
2019-05-30    0.610504
2019-05-31    0.738883
2019-06-01    0.247118
2019-06-02    0.215886
Freq: D, Name: A, dtype: float64


df[0:2]


df['20190529':'20190531']


dates

DatetimeIndex(['2019-05-27', '2019-05-28', '2019-05-29', '2019-05-30',
               '2019-05-31', '2019-06-01', '2019-06-02'],
              dtype='datetime64[ns]', freq='D')


df.loc[dates[2]]

A    0.635973
B    0.778371
C    0.927694
D    0.971305
Name: 2019-05-29 00:00:00, dtype: float64


df.loc[dates[2],['B','C']]

B    0.778371
C    0.927694
Name: 2019-05-29 00:00:00, dtype: float64


df.iloc[2,1:3]

B    0.778371
C    0.927694
Name: 2019-05-29 00:00:00, dtype: float64


df[ df>0.5 ]


s = pd.Series( np.random.rand(7), index=dates )
s

2019-05-27    0.084900
2019-05-28    0.337333
2019-05-29    0.465100
2019-05-30    0.517110
2019-05-31    0.688339
2019-06-01    0.455821
2019-06-02    0.907693
Freq: D, dtype: float64


df['E'] = s
df


df.loc[:,['C']] = 0
df


df.mean()

A    0.397803
B    0.684547
C    0.000000
D    0.718920
E    0.493756
dtype: float64


df.mean(axis=1)

2019-05-27    0.380895
2019-05-28    0.397099
2019-05-29    0.570150
2019-05-30    0.613211
2019-05-31    0.500556
2019-06-01    0.363283
2019-06-02    0.387844
Freq: D, dtype: float64


df1 = pd.DataFrame( np.random.rand(7,2), index=dates, columns=['A','B'])
df2 = pd.DataFrame( np.random.rand(7,3), index=dates, columns=['C','D','E'])
pd.concat([df1,df2],sort=False)


pd.concat([df1,df2],axis=1,join='inner') # join='inner' keeps only the index labels present in every frame; concat accepts 'inner' or 'outer' (left/right joins are done with pd.merge or DataFrame.join)


s = pd.Series( ["a","b","a","c","a","c","b"], index=dates)
df['E']=s
df


df.groupby('E').sum()


# Input frame for the pivot-table example: six consecutive days (index named
# 'date'), three random numeric columns and two string key columns.
dates = pd.date_range('20190527', periods=6, name='date')
df = pd.DataFrame(np.random.rand(6, 3), index=dates, columns=list('ABC'))
key_columns = {
    'D': ["a", "a", "b", "b", "c", "c"],
    'E': ["one", "two", "one", "two", "one", "two"],
}
for col, vals in key_columns.items():
    df[col] = pd.Series(vals, index=dates)
df


pd.pivot_table(df, values=['A','B','C'], index=['D','E'])


%matplotlib inline
df.plot()

<AxesSubplot:xlabel='date'>

Format Type	Data Description	Reader	Writer
text	CSV	`read_csv`	`to_csv`
text	JSON	`read_json`	`to_json`
text	HTML	`read_html`	`to_html`
text	Local clipboard	`read_clipboard`	`to_clipboard`
binary	MS Excel	`read_excel`	`to_excel`
binary	HDF5 Format	`read_hdf`	`to_hdf`
binary	Feather Format	`read_feather`	`to_feather`
binary	Parquet Format	`read_parquet`	`to_parquet`
binary	Msgpack (deprecated in pandas 0.25, removed in 1.0)	`read_msgpack`	`to_msgpack`
binary	Stata	`read_stata`	`to_stata`
binary	SAS	`read_sas`
binary	Pickle Format	`read_pickle`	`to_pickle`
SQL	SQL	`read_sql`	`to_sql`
SQL	Google Big Query	`read_gbq`	`to_gbq`

	A	B
count	100.000000	1.000000e+02
mean	4.150000	-2.560000e+00
std	3.176222	1.070389e+18
min	0.000000	-9.223372e+18
25%	1.000000	0.000000e+00
50%	4.000000	6.144000e+03
75%	7.000000	1.717987e+11
max	9.000000	4.611686e+18

	A	B	C	D
2019-05-27	0.272727	0.782185	0.871513	0.764663
2019-05-28	0.063530	0.615451	0.009551	0.969182
2019-05-29	0.635973	0.778371	0.927694	0.971305
2019-05-30	0.610504	0.943599	0.580770	0.994840
2019-05-31	0.738883	0.543482	0.686757	0.532077
2019-06-01	0.247118	0.453881	0.635333	0.659595
2019-06-02	0.215886	0.674861	0.180832	0.140780

	A	B	C	D
2019-05-27	0.272727	0.782185	0.871513	0.764663
2019-05-28	0.063530	0.615451	0.009551	0.969182

	A	B	C	D	E
2019-05-27	0.933696	0.595231	0.021164	0.856063	0.664016
2019-05-28	0.743323	0.174635	0.987556	0.649094	0.277106
2019-05-29	0.534813	0.248670	0.545813	0.103561	0.637652
2019-05-30	0.515590	0.645915	0.478624	0.849072	0.540173
2019-05-31	0.034665	0.791540	0.230484	0.758756	0.457617
2019-06-01	0.408191	0.827338	0.061748	0.648818	0.480178
2019-06-02	0.333172	0.726970	0.643230	0.200065	0.312921

Foreword¶

Python Perugia INFN Course: Data Science tools¶

Scientific python stack¶

Numpy¶

Important: Views or copy¶

Boolean masks for extracting values¶

Values with list of indexes¶

Simple Operations¶

Reductions¶

Matplotlib¶

Customization¶

Multiple plots¶

Examples¶

Pandas¶

Reading/Saving dataframes¶

View data¶

Select data¶

Setting values¶

Operations¶

Merging dataframes¶

Grouping¶

Pivot table¶

Plotting data¶

	A	B	C	D
E
a	1.647583	2.104038	0.0	2.268045
b	0.279415	1.290312	0.0	1.109962
c	0.857622	1.397480	0.0	1.654435

	A	B	C	D	E
date
2019-05-27	0.829799	0.720495	0.173438	a	one
2019-05-28	0.252488	0.014775	0.521227	a	two
2019-05-29	0.646422	0.112209	0.143434	b	one
2019-05-30	0.841273	0.665084	0.762137	b	two
2019-05-31	0.780462	0.595384	0.826812	c	one
2019-06-01	0.551363	0.768771	0.079635	c	two