# Silence all Python warnings so the notebook output stays uncluttered.
# NOTE: this is a global filter; it also hides deprecation warnings raised
# by any library used later in the notebook.
import warnings
warnings.filterwarnings(action='ignore')


# Import pandas, read the CSV dataset into `df`, and preview the first rows.
# NOTE(review): the path is an absolute, machine-specific Windows location;
# the notebook will only run where this exact file exists.
import pandas as pd

csv_path = r'C:\Users\PC\Desktop\Project excel\kingc.csv'
df = pd.read_csv(csv_path)
df.head()


# Print a structural summary of the DataFrame: index range, column names,
# non-null counts per column, dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319326 entries, 0 to 319325
Data columns (total 11 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   Locator           319326 non-null  object 
 1   WaterYear         319326 non-null  int64  
 2   WQI               319326 non-null  float64
 3   Month             319326 non-null  int64  
 4   ParameterGroup    319326 non-null  object 
 5   lng               319326 non-null  float64
 6   lat               319326 non-null  float64
 7   MostRecentSample  319326 non-null  bool   
 8   SiteName          319326 non-null  object 
 9   StreamName        319326 non-null  object 
 10  WQI_binned        319326 non-null  int64  
dtypes: bool(1), float64(3), int64(3), object(4)
memory usage: 24.7+ MB


# Number of rows per distinct Locator value (most frequent first).
locator_column = df['Locator']
item_counts = locator_column.value_counts()
print(item_counts)

3106             6981
B319             6772
470              6593
430              6579
474              6564
                 ... 
A670             1441
BSE_1MUDMTNRD    1313
B499              946
A687              648
0484A             321
Name: Locator, Length: 78, dtype: int64


# Number of rows per distinct ParameterGroup value (most frequent first).
parameter_group_column = df['ParameterGroup']
item_counts = parameter_group_column.value_counts()
print(item_counts)

Temperature               28819
FecalBacteria             28717
Sediment                  28194
Nutrient                  28172
Total.Phosphorus          28150
Total.Suspended.Solids    28059
pH                        27562
Turbidity                 27109
MonthlyScore              26734
Penalty                   26362
Total.Nitrogen            20669
DissolvedOxygen           18287
AnnualScore                2492
Name: ParameterGroup, dtype: int64


# Number of rows per distinct SiteName value (most frequent first).
site_name_column = df['SiteName']
item_counts = site_name_column.value_counts()
print(item_counts)

Green River at Starfire Way                      6981
Green River at 212th Way SE                      6772
Swamp Creek mouth at Bothell Way NE              6593
Lyon Creek mouth at Bothell Way NE               6579
North Creek mouth at Sammamish River Trail       6564
                                                 ... 
Laughin Jacobs mouth at E Lake Sammamish Pkwy    1441
Boise Creek mouth at SE Mud Mountain Rd          1313
Yarrow Creek near 101st Way NE                    946
Zackuse Creek                                     648
Bear Creek upstream of mouth                      321
Name: SiteName, Length: 78, dtype: int64


# Number of rows per distinct StreamName value (most frequent first).
stream_name_column = df['StreamName']
item_counts = stream_name_column.value_counts()
print(item_counts)

Green                       25473
Issaquah                    17441
Bear                        16244
Pipers                      13054
Evans                       10825
Juanita                     10538
North                        9717
Swamp                        9362
Little Bear                  8967
Cedar                        8764
Sammamish                    8726
Crisp                        8523
Lyon                         6579
Newaukum                     6554
Thornton                     6477
May                          6461
Soos                         6353
Springbrook/Black River      6340
Jenkins                      6275
McAleer                      5810
Mill                         5806
Coal                         5802
Little Soos                  5796
Mercer Slough                5792
Cottage Lake Creek           5739
Forbes                       5736
Covington                    5696
Cochran Springs              5542
Venema                       4476
Pine Lake                    4433
Longfellow                   4286
Idylwood                     3722
Lewis                        3704
Ebright                      3561
George Davis                 3536
Tibbetts                     3379
Judd                         2467
Shinglemill                  2292
Mileta                       2292
Fisher                       2288
Gorsuch                      2073
Cherry Creek                 2003
Harris Creek                 1959
Ames Creek                   1959
Patterson Creek              1959
Ravensdale                   1881
Rock                         1871
Griffin Creek                1862
Snoqualmie - South Fork      1861
Snoqualmie - Middle Fork     1861
Snoqualmie - North Fork      1861
Tolt River                   1861
Snoqualmie                   1860
Raging River                 1859
Skykomish                    1850
Tahlequah                    1570
Laughing Jacobs              1441
Boise                        1313
Yarrow                        946
Zackuse                       648
Name: StreamName, dtype: int64


# Number of rows per WaterYear. The result is ordered by count, not by year.
water_year_column = df['WaterYear']
item_counts = water_year_column.value_counts()
print(item_counts)

2021    11624
2022    11608
2019    11561
2018    11455
2017    11361
2016    11285
2015    11218
2020    10192
2007     8776
2008     8717
2014     8699
2005     7954
2006     7896
2004     7601
2003     7552
1999     7452
2013     7340
2001     7305
2002     7249
1998     7076
2000     7007
1997     6742
1996     6413
1995     6263
2023     6225
1994     5944
2012     5706
1993     5535
2009     5236
1988     4961
1992     4943
1989     4896
2011     4860
1987     4782
1982     4774
1981     4770
1990     4559
1985     4534
1980     4520
1984     4508
1991     4438
1983     4205
1986     4158
1979     4016
2010     3766
1977     1803
1978     1442
1976     1010
1973      969
1972      928
1974      789
1975      476
1971      149
1970       78
Name: WaterYear, dtype: int64


# Number of rows in each WQI_binned class (most frequent first).
wqi_bin_column = df['WQI_binned']
item_counts = wqi_bin_column.value_counts()
print(item_counts)

3    222573
2     66488
1     30265
Name: WQI_binned, dtype: int64


# Number of rows per Month value (most frequent first).
# NOTE(review): the data contains a Month value of 13, which is not a
# calendar month -- confirm what it encodes before using Month as a date part.
month_column = df['Month']
item_counts = month_column.value_counts()
print(item_counts)

3     25511
2     25432
12    24918
13    24820
9     24722
7     24615
10    24595
8     24446
1     24406
6     24356
5     24285
4     23841
11    23379
Name: Month, dtype: int64


# Keep only our column of interest (WQI) and preview the first rows.
# NOTE: this rebinds `df`, so every other column is no longer reachable
# through `df` after this cell runs.
columns_of_interest = ['WQI']
df = df[columns_of_interest]
df.head()

	WQI
0	70.93
1	61.14
2	74.90
3	75.38
4	83.90

	Locator	WaterYear	WQI	Month	ParameterGroup	lng	lat	MostRecentSample	SiteName	StreamName	WQI_binned
0	311	1970	70.93	13	AnnualScore	-122.2479	47.4655	False	Green River at Interurban	Green	2
1	311	1971	61.14	13	AnnualScore	-122.2479	47.4655	False	Green River at Interurban	Green	2
2	311	1972	74.90	13	AnnualScore	-122.2479	47.4655	False	Green River at Interurban	Green	2
3	311	1973	75.38	13	AnnualScore	-122.2479	47.4655	False	Green River at Interurban	Green	2
4	311	1974	83.90	13	AnnualScore	-122.2479	47.4655	False	Green River at Interurban	Green	3

Data exploration