[Note2] IDSP-Numpy


NumPy is a library for the Python programming language, adding support for large, multi-dimensional arrays and matrices, along with a large collection of high-level mathematical functions to operate on these arrays.
1 Create Array
import numpy as np
import math
b = np.array([[1,2,3],[4,5,6]])
b
array([[1, 2, 3],
[4, 5, 6]])
# Dimensions of the array, which .shape returns as a tuple
b.shape
(2, 3)
d = np.zeros((2,3))
print(d)
e = np.ones((2,3))
print(e)
[[0. 0. 0.]
[0. 0. 0.]]
[[1. 1. 1.]
[1. 1. 1.]]
# Random numbers
np.random.rand(2,3)
array([[0.9328578 , 0.07847829, 0.34655399],
[0.58607914, 0.12490093, 0.62996355]])
# Create an array of every even number from ten (inclusive) to fifty (exclusive)
f = np.arange(10, 50, 2)
f
array([10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42,
44, 46, 48])
# In linspace() the third argument is the total number of items you want to generate
np.linspace( 0, 2, 15 ) # 15 numbers from 0 (inclusive) to 2 (inclusive)
array([0. , 0.14285714, 0.28571429, 0.42857143, 0.57142857,
0.71428571, 0.85714286, 1. , 1.14285714, 1.28571429,
1.42857143, 1.57142857, 1.71428571, 1.85714286, 2. ])
2 Array Operations
a = np.array([10,20,30,40])
b = np.array([1, 2, 3,4])
# a times b
d = a*b
print(d)
[ 10 40 90 160]
# Matrix product of a and b (for two 1-D arrays, @ computes the dot product, a single number)
e = a@b
print(e)
300
# Convert a number of Fahrenheit values to Celsius using C = (F - 32) * 5/9.
# The offset is 32 because 32 degrees Fahrenheit is the freezing point of water (0 degrees Celsius).
farenheit = np.array([0,-10,-5,-15,0])
celcius = (farenheit - 32) * (5/9)
celcius
array([-17.77777778, -23.33333333, -20.55555556, -26.11111111,
-17.77777778])
# Numpy arrays have many interesting aggregation functions.
array3 = np.array([[1,2,3],[4,5,6]])
print(array3.sum())
print(array3.max())
print(array3.min())
print(array3.mean())
21
6
1
3.5
# let's create an array with 15 elements, ranging from 1 to 15,
# with a dimension of 3X5
b = np.arange(1,16,1).reshape(3,5)
print(b)
[[ 1 2 3 4 5]
[ 6 7 8 9 10]
[11 12 13 14 15]]
3 Indexing, Slicing and Iterating
3.1 Indexing
a = np.array([1,3,5,7])
a[2]
5
# Remember in python we start at 0!
a = np.array([[1,2], [3, 4], [5, 6]])
a[1,1]
4
# Get multiple elements
np.array([a[0, 0], a[1, 1], a[2, 1]])
array([1, 4, 6])
# We can also do that by using another form of array indexing, which essentially "zips" the first list and the
# second list up
# [0,0] [1,1] [2,1]   a[[row indices], [column indices]]
print(a[[0, 1, 2], [0, 1, 1]])
[1 4 6]
3.2 Boolean Indexing
a = np.array([[1,2], [3, 4], [5, 6]])
print(a >5)
[[False False]
[False False]
[False True]]
# We can then place this array of booleans like a mask over the original array to return a one-dimensional
# array relating to the true values.
print(a[a>5])
[6]
As we will see, this functionality is essential in the pandas toolkit
3.3 Slicing
# Get elements from index 0 to index 3 (excluding index 3)
a = np.array([0,1,2,3,4,5])
print(a[:3])
[0 1 2]
# By putting 2:4 in the bracket, we get elements from index 2 to index 4 (excluding index 4)
print(a[2:4])
[2 3]
# For multi-dimensional arrays, a[:2] gets all the elements from the first row (index 0) and the second row (index 1)
a = np.array([[1,2,3,4], [5,6,7,8], [9,10,11,12]])
a[:2]
array([[1, 2, 3, 4],
[5, 6, 7, 8]])
#For Specific elements in array
#The first argument is for selecting rows
#The second argument is for selecting columns
a[:2, 1:3]
array([[2, 3],
[6, 7]])
sub_array = a[:2, 1:3]
sub_array
array([[2, 3],
[6, 7]])
# Modifying the sub array will consequently modify the original array
print("sub array index [0,0] value before change:", sub_array[0,0])
sub_array[0,0] = 50
print("sub array index [0,0] value after change:", sub_array[0,0])
print("original array index [0,1] value after change:", a[0,1])
sub array index [0,0] value before change: 2
sub array index [0,0] value after change: 50
original array index [0,1] value after change: 50
4 Trying Numpy with Datasets
# Here we have a very popular dataset on wine quality, and we are going to only look at red wines. The data
# fields include: fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide,
# total sulfur dioxide, density, pH, sulphates, alcohol, quality
# To load a dataset in Numpy, we can use the genfromtxt() function. We can specify data file name, delimiter
# (which is optional but often used), and number of rows to skip if we have a header row, hence it is 1 here
# The genfromtxt() function has a parameter called dtype for specifying the data type of each column; this
# parameter is optional. Without specifying the types, all values are cast to one common, more
# general/precise type (floating point, as the output below shows)
wines = np.genfromtxt("datasets/winequality-red.csv", delimiter=";", skip_header=1)
wines
array([[ 7.4 , 0.7 , 0. , ..., 0.56 , 9.4 , 5. ],
[ 7.8 , 0.88 , 0. , ..., 0.68 , 9.8 , 5. ],
[ 7.8 , 0.76 , 0.04 , ..., 0.65 , 9.8 , 5. ],
...,
[ 6.3 , 0.51 , 0.13 , ..., 0.75 , 11. , 6. ],
[ 5.9 , 0.645, 0.12 , ..., 0.71 , 10.2 , 5. ],
[ 6. , 0.31 , 0.47 , ..., 0.66 , 11. , 6. ]])
# Recall that we can use integer indexing to get a certain column or a row. For example, if we want to select
# the fixed acidity column, which is the first column, we can do so by entering the index into the array.
# Also remember that for multidimensional arrays, the first argument refers to the row, and the second
# argument refers to the column, and if we just give one argument then we'll get a single dimensional list
# back.
# So all rows combined but only the first column from them would be
print("one integer 0 for slicing: ", wines[:, 0])
# But if we wanted the same values but wanted to preserve that they sit in their own rows we would write
print("0 to 1 for slicing: \n", wines[:, 0:1])
one integer 0 for slicing: [7.4 7.8 7.8 ... 6.3 5.9 6. ]
0 to 1 for slicing:
[[7.4]
[7.8]
[7.8]
...
[6.3]
[5.9]
[6. ]]
# This is another great example of how the shape of the data is an abstraction which we can layer
# intentionally on top of the data we are working with.
# If we want a range of columns in order, say columns 0 through 3 (recall, this means first, second, and
# third, since we start at zero and don't include the trailing index value), we can do that too
wines[:, 0:3]
array([[7.4 , 0.7 , 0. ],
[7.8 , 0.88 , 0. ],
[7.8 , 0.76 , 0.04 ],
...,
[6.3 , 0.51 , 0.13 ],
[5.9 , 0.645, 0.12 ],
[6. , 0.31 , 0.47 ]])
# What if we want several non-consecutive columns? We can place the indices of the columns that we want into
# an array and pass the array as the second argument. Here's an example
wines[:, [0,2,4]]
array([[7.4 , 0. , 0.076],
[7.8 , 0. , 0.098],
[7.8 , 0.04 , 0.092],
...,
[6.3 , 0.13 , 0.076],
[5.9 , 0.12 , 0.075],
[6. , 0.47 , 0.067]])
# We can also do some basic summarization of this dataset. For example, if we want to find out the average
# quality of red wine, we can select the quality column. We could do this in a couple of ways, but the most
# appropriate is to use the -1 value for the index, as negative numbers mean slicing from the back of the
# list. We can then call the aggregation functions on this data.
wines[:,-1].mean()
5.6360225140712945
# Let's take a look at another dataset, this time on graduate school admissions. It has fields such as GRE
# score, TOEFL score, university rating, GPA, having research experience or not, and a chance of admission.
# With this dataset, we can do data manipulation and basic analysis to infer what conditions are associated
# with higher chance of admission. Let's take a look.
# We can specify data field names when using genfromtxt() to load CSV data. Also, we can have numpy try and
# infer the type of a column by setting the dtype parameter to None
graduate_admission = np.genfromtxt('datasets/Admission_Predict.csv', dtype=None, delimiter=',', skip_header=1,
names=('Serial No','GRE Score', 'TOEFL Score', 'University Rating', 'SOP',
'LOR','CGPA','Research', 'Chance of Admit'))
graduate_admission
array([( 1, 337, 118, 4, 4.5, 4.5, 9.65, 1, 0.92),
( 2, 324, 107, 4, 4. , 4.5, 8.87, 1, 0.76),
( 3, 316, 104, 3, 3. , 3.5, 8. , 1, 0.72),
...
(398, 330, 116, 4, 5. , 4.5, 9.45, 1, 0.91),
(399, 312, 103, 3, 3.5, 4. , 8.78, 0, 0.67),
(400, 333, 117, 4, 5. , 4. , 9.66, 1, 0.95)],
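dtype=[('Serial_No', '<i8'), ('GRE_Score', '<i8'), ('TOEFL_Score', '<i8'), ('University_Rating', '<i8'), ('SOP', '<f8'), ('LOR', '<f8'), ('CGPA', '<f8'), ('Research', '<i8'), ('Chance_of_Admit', '<f8')])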
# Notice that the resulting array is actually a one-dimensional array with 400 tuples
graduate_admission.shape
(400,)
# We can retrieve a column from the array using the column's name. For example, let's get the CGPA column and
# only the first five values.
graduate_admission['CGPA'][0:5]
array([9.65, 8.87, 8. , 8.67, 8.21])
# Since the GPA in the dataset ranges from 1 to 10, and in the US it's more common to use a scale of up to 4,
# a common task might be to convert the GPA by dividing by 10 and then multiplying by 4
graduate_admission['CGPA'] = graduate_admission['CGPA'] /10 *4
graduate_admission['CGPA'][0:20] #let's get 20 values
array([3.86 , 3.548, 3.2 , 3.468, 3.284, 3.736, 3.28 , 3.16 , 3.2 ,
3.44 , 3.36 , 3.6 , 3.64 , 3.2 , 3.28 , 3.32 , 3.48 , 3.2 ,
3.52 , 3.4 ])
# Recall boolean masking. We can use this to find out how many students have had research experience by
# creating a boolean mask and passing it to the array indexing operator
len(graduate_admission[graduate_admission['Research'] == 1])
219
# Since we have the data field chance of admission, which ranges from 0 to 1, we can try to see if students
# with high chance of admission (>0.8) on average have higher GRE score than those with lower chance of
# admission (<0.4)
# So first we use boolean masking to pull out only those students we are interested in based on their chance
# of admission, then we pull out only their GRE scores, then we print the mean values.
print(graduate_admission[graduate_admission['Chance_of_Admit'] > 0.8]['GRE_Score'].mean())
print(graduate_admission[graduate_admission['Chance_of_Admit'] < 0.4]['GRE_Score'].mean())
328.7350427350427
302.2857142857143
# Take a moment to reflect here, do you understand what is happening in these calls?
# When we do the boolean masking we are left with an array with tuples in it still, and numpy holds underneath
# this a list of the columns we specified and their name and indexes
graduate_admission[graduate_admission['Chance_of_Admit'] > 0.8]
array([( 1, 337, 118, 4, 4.5, 4.5, 3.86 , 1, 0.92),
( 6, 330, 115, 5, 4.5, 3. , 3.736, 1, 0.9 ),
( 12, 327, 111, 4, 4. , 4.5, 3.6 , 1, 0.84),
...
(396, 324, 110, 3, 3.5, 3.5, 3.616, 1, 0.82),
(397, 325, 107, 3, 3. , 3.5, 3.644, 1, 0.84),
(398, 330, 116, 4, 5. , 4.5, 3.78 , 1, 0.91),
(400, 333, 117, 4, 5. , 4. , 3.864, 1, 0.95)],
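dtype=[('Serial_No', '<i8'), ('GRE_Score', '<i8'), ('TOEFL_Score', '<i8'), ('University_Rating', '<i8'), ('SOP', '<f8'), ('LOR', '<f8'), ('CGPA', '<f8'), ('Research', '<i8'), ('Chance_of_Admit', '<f8')])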
# Let's also do this with GPA
print(graduate_admission[graduate_admission['Chance_of_Admit'] > 0.8]['CGPA'].mean())
print(graduate_admission[graduate_admission['Chance_of_Admit'] < 0.4]['CGPA'].mean())
3.7106666666666666
3.0222857142857142
# Hrm, well, I guess one could have expected this. The GPA and GRE for students who have a higher chance of
# being admitted, at least based on our cursory look here, seem to be higher.
5 Numpy + Image
from PIL import Image
from IPython.display import display
im = Image.open('chris.tiff')
display(im)

# Convert this PIL image to a numpy array
array=np.array(im)
print(array.shape)
array
(200, 200)
array([[118, 117, 118, ..., 103, 107, 110],
[113, 113, 113, ..., 100, 103, 106],
[108, 108, 107, ..., 95, 98, 102],
...,
[177, 181, 182, ..., 193, 198, 192],
[178, 182, 183, ..., 193, 201, 189],
[178, 182, 184, ..., 193, 201, 187]], dtype=uint8)
# Let's create an array the same shape
mask=np.full(array.shape,255)
mask
array([[255, 255, 255, ..., 255, 255, 255],
[255, 255, 255, ..., 255, 255, 255],
[255, 255, 255, ..., 255, 255, 255],
...,
[255, 255, 255, ..., 255, 255, 255],
[255, 255, 255, ..., 255, 255, 255],
[255, 255, 255, ..., 255, 255, 255]])
# Subtract the mask from the image array (every result is between -255 and 0)
modified_array=array-mask
# Convert all of the negative values to positive values
modified_array=modified_array*-1
# Set the value of the datatype correctly
modified_array=modified_array.astype(np.uint8)
modified_array
array([[137, 138, 137, ..., 152, 148, 145],
[142, 142, 142, ..., 155, 152, 149],
[147, 147, 148, ..., 160, 157, 153],
...,
[ 78, 74, 73, ..., 62, 57, 63],
[ 77, 73, 72, ..., 62, 54, 66],
[ 77, 73, 71, ..., 62, 54, 68]], dtype=uint8)
# Display this new array using the fromarray() function in the Python imaging library (PIL)
display(Image.fromarray(modified_array))

# Think of this as a giant array of bytes, and that the shape was an abstraction.
# Reshape the array and still try and render it.
reshaped=np.reshape(modified_array,(100,400))
print(reshaped.shape)
display(Image.fromarray(reshaped))
(100, 400)
