Problem Set 10 uses K-Means clustering to group US counties by similarity across 14 demographic features, such as income, housing, education, farm acreage, age, population, population change, and several others.
The first task is to measure the sum of squared error (SSE): the sum, over every point, of the squared distance between that point and the centroid of its cluster.
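Concretely, with the Point and Cluster classes from the listing below, the SSE is just the squared point-to-centroid distances summed over every cluster. A minimal sketch (sumSquaredError is my own helper name, not part of the assignment code):

def sumSquaredError(clusters):
    # Sum of squared distances from each point to its cluster's centroid.
    sse = 0.0
    for c in clusters:
        centroid = c.getCentroid()
        for p in c.getPoints():
            sse += p.distance(centroid) ** 2
    return sse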
Next, we look at the effect of increasing the number of initial clusters (K). If the number of clusters equaled the number of points in the dataset, the SSE would be zero and we would be finished; that, however, tells us little (if anything) about the relationships between points. So choosing a good value of K is important.
In the county dataset, we see that as the number of clusters (k) increases, the SSE follows a nearly monotonic downward trend.
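That trend can be seen by sweeping k directly. A minimal sketch of the sweep, reusing the sumSquaredError helper above and the kmeans and County definitions from the listing below (sseByK is my own name):

def sseByK(points, kvals, cutoff = 0.1):
    # Cluster once per k and record the resulting error.
    results = []
    for k in kvals:
        clusters, maxDist = kmeans(points, k, cutoff, County)
        results.append((k, sumSquaredError(clusters)))
    return results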
The final part of the problem is to weight features so that the clusters group more tightly on poverty rate without using that feature in the distance measurement. Weighting all features except poverty gives an SSE of about 44,000 at 125 clusters, while weighting only income and high school graduation rate yields a noticeably lower error, about 31,000.
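The mechanism is the County.weights hook in the listing below: a vector of per-feature weights multiplied into the squared differences inside County.distance, so a weight of 0.0 drops that feature from the metric entirely. For example, to cluster on everything except poverty (feature index 2):

# Assumes the County class from the listing below; a 0.0 weight removes
# that feature from the distance metric, a 1.0 leaves it untouched.
# Index 2 is the poverty rate in this dataset's feature order.
County.weights = pylab.array([1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0,
                              1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])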
# Problem Set 10
# Name: Casey

# Code shared across examples
import pylab, random, string, copy, math

# A Point holds a name plus original and normalized attribute arrays.
class Point(object):
    def __init__(self, name, originalAttrs, normalizedAttrs = None):
        """normalizedAttrs and originalAttrs are both arrays"""
        self.name = name
        self.unNormalized = originalAttrs
        self.attrs = normalizedAttrs
    def dimensionality(self):
        return len(self.attrs)
    def getAttrs(self):
        return self.attrs
    def getOriginalAttrs(self):
        return self.unNormalized
    def distance(self, other):
        # Euclidean distance metric
        difference = self.attrs - other.attrs
        return sum(difference * difference) ** 0.5
    def getName(self):
        return self.name
    def toStr(self):
        return self.name + str(self.attrs)
    def __str__(self):
        return self.name

# County is a subclass of Point that adds per-feature weighting.
class County(Point):
    # Class-level weights; graphPredictionErr below reassigns these to
    # emphasize or drop individual features.
    weights = pylab.array([1.0] * 14)
    # Override Point.distance to use County.weights to decide the
    # significance of each dimension: a weighted Euclidean metric.
    def distance(self, other):
        difference = self.getAttrs() - other.getAttrs()
        # sqrt of the weighted sum of squared attribute differences
        return sum(County.weights * difference * difference) ** 0.5

class Cluster(object):
    def __init__(self, points, pointType):
        self.points = points
        self.pointType = pointType
        self.centroid = self.computeCentroid()
    def getCentroid(self):
        return self.centroid
    def computeCentroid(self):
        # dimensionality comes from the Point superclass
        dim = self.points[0].dimensionality()
        totVals = pylab.array([0.0] * dim)
        for p in self.points:
            totVals += p.getAttrs()
        meanPoint = self.pointType('mean',
                                   totVals / float(len(self.points)),
                                   totVals / float(len(self.points)))
        return meanPoint
    def update(self, points):
        oldCentroid = self.centroid
        self.points = points
        if len(points) > 0:
            self.centroid = self.computeCentroid()
            return oldCentroid.distance(self.centroid)
        else:
            return 0.0
    def getPoints(self):
        return self.points
    def contains(self, name):
        for p in self.points:
            if p.getName() == name:
                return True
        return False
    def toStr(self):
        result = ''
        for p in self.points:
            result = result + p.toStr() + ', '
        return result[:-2]
    def __str__(self):
        result = ''
        for p in self.points:
            result = result + str(p) + ', '
        return result[:-2]

def kmeans(points, k, cutoff, pointType, minIters = 3, maxIters = 100,
           toPrint = False):
    """ Returns (Cluster list, max dist of any point to its cluster) """
    # Uses random initial centroids
    initialCentroids = random.sample(points, k)
    clusters = []
    for p in initialCentroids:
        clusters.append(Cluster([p], pointType))
    numIters = 0
    biggestChange = cutoff
    while (biggestChange >= cutoff and numIters < maxIters) or numIters < minIters:
        print "Starting iteration " + str(numIters)
        newClusters = []
        for c in clusters:
            newClusters.append([])
        # Assign each point to its nearest centroid
        for p in points:
            smallestDistance = p.distance(clusters[0].getCentroid())
            index = 0
            for i in range(len(clusters)):
                distance = p.distance(clusters[i].getCentroid())
                if distance < smallestDistance:
                    smallestDistance = distance
                    index = i
            newClusters[index].append(p)
        # Recompute centroids and track the largest centroid movement
        biggestChange = 0.0
        for i in range(len(clusters)):
            change = clusters[i].update(newClusters[i])
            biggestChange = max(biggestChange, change)
        numIters += 1
        if toPrint:
            print 'Iteration count =', numIters
    maxDist = 0.0
    for c in clusters:
        for p in c.getPoints():
            if p.distance(c.getCentroid()) > maxDist:
                maxDist = p.distance(c.getCentroid())
    print 'Total Number of iterations =', numIters, 'Max Diameter =', maxDist
    print biggestChange
    return clusters, maxDist

# US Counties example
def readCountyData(fName, numEntries = 14):
    dataFile = open(fName, 'r')
    dataList = []
    nameList = []
    maxVals = pylab.array([0.0] * numEntries)
    # Build unnormalized feature vector
    for line in dataFile:
        if len(line) == 0 or line[0] == '#':
            continue
        dataLine = string.split(line)
        name = dataLine[0] + dataLine[1]
        features = []
        # Build vector with numEntries features
        for f in dataLine[2:]:
            try:
                f = float(f)
                features.append(f)
                if f > maxVals[len(features) - 1]:
                    maxVals[len(features) - 1] = f
            except ValueError:
                name = name + f
        if len(features) != numEntries:
            continue
        dataList.append(features)
        nameList.append(name)
    return nameList, dataList, maxVals

def buildCountyPoints(fName):
    """ Given an input filename, reads County values from the file and
    returns them all in a list. """
    nameList, featureList, maxVals = readCountyData(fName)
    points = []
    for i in range(len(nameList)):
        originalAttrs = pylab.array(featureList[i])
        normalizedAttrs = originalAttrs / pylab.array(maxVals)
        points.append(County(nameList[i], originalAttrs, normalizedAttrs))
    return points

def randomPartition(l, p):
    """ Splits the input list into two partitions, where each element of l
    is in the first partition with probability p and the second one with
    probability (1.0 - p).
    l: The list to split
    p: The probability that an element of l will be in the first partition
    Returns: a tuple of lists, containing the elements of the first and
    second partitions. """
    l1 = []
    l2 = []
    for x in l:
        if random.random() < p:
            l1.append(x)
        else:
            l2.append(x)
    return (l1, l2)

def getAveIncome(cluster):
    """ Given a Cluster object, finds the average income field over the
    members of that cluster.
    cluster: the Cluster object to check
    Returns: a float representing the computed average income value """
    tot = 0.0
    for c in cluster.getPoints():
        tot += c.getOriginalAttrs()[1]
    return float(tot) / len(cluster.getPoints())

def getAvePoverty(cluster):
    """ Like getAveIncome, but averages the poverty field (index 2). """
    tot = 0.0
    for c in cluster.getPoints():
        tot += c.getOriginalAttrs()[2]
    return float(tot) / len(cluster.getPoints())

def test(points, k = 200, cutoff = 0.1):
    """ A sample function to show you how to do a simple kmeans run and
    graph the results. """
    # Note: this run uses the default all-ones County.weights.
    incomes = []
    print ''
    clusters, maxSmallest = kmeans(points, k, cutoff, County)
    for i in range(len(clusters)):
        if len(clusters[i].points) == 0:
            continue
        incomes.append(getAveIncome(clusters[i]))
    pylab.hist(incomes)
    pylab.xlabel('Ave. Income')
    pylab.ylabel('Number of Clusters')
    pylab.show()

# Create a list of points for all counties
points = buildCountyPoints('counties.txt')
#random.seed(123)
# Create a sample using only 10% of the points from counties
testPoints = random.sample(points, len(points) / 10)
# test(testPoints)
# print testPoints

def graphRemovedErr(points, kvals = [25, 50, 75, 100, 125, 150], cutoff = 0.1):
    """ Produces graphs of the error in training and holdout point sets,
    and the ratio of the two errors, after clustering for the given
    values of k. For details see Problem 1. """
    # 1.1 partition into training and holdout sets
    training, holdout = randomPartition(points, 0.8)
    # 1.2 sum the squared distance of all points to their cluster's centroid
    t_list = []
    h_list = []
    ratios = []
    for k in kvals:
        # Sum of squared dist of each point in the training set
        clusters, maxSmallest = kmeans(training, k, cutoff, County)
        dists = []
        for c in clusters:
            for p in c.getPoints():
                dists.append(p.distance(c.getCentroid()) ** 2)
        t_tot = sum(dists)
        t_list.append(t_tot)
        # Sum of squared dist of each point in the holdout set
        clusters, maxSmallest = kmeans(holdout, k, cutoff, County)
        dists = []
        for c in clusters:
            for p in c.getPoints():
                dists.append(p.distance(c.getCentroid()) ** 2)
        h_tot = sum(dists)
        h_list.append(h_tot)
        ratios.append(h_tot / t_tot)
    # 2.1 graph errors against k
    pylab.plot(h_list)
    pylab.plot(t_list)
    pylab.xlabel('k vals')
    pylab.xticks(range(len(kvals)), kvals)
    pylab.ylabel('Distance Error')
    pylab.show()
    # 2.2 graph the holdout/training error ratio in a separate plot
    pylab.plot(ratios)
    pylab.xlabel('k vals')
    pylab.xticks(range(len(kvals)), kvals)
    pylab.ylabel('holdout over training ratio')
    pylab.show()
    return t_list, h_list  # a tuple of two lists

# testPoints is too small for this @ 10% of counties.txt w/ holdout of 20%
#print graphRemovedErr(points)

def kmeansAndYou(points, k = 50, cutoff = 0.1):
    # Print the cluster containing AZCoconino over three independent runs
    for i in range(3):
        clusters, maxSmallest = kmeans(points, k, cutoff, County)
        for c in clusters:
            for p in c.getPoints():
                if p.getName() == "AZCoconino":
                    print c, "\n\n"
#print kmeansAndYou(points)

def graphPredictionErr(points, dimension, kvals = [25, 50, 75, 100, 125],
                       cutoff = 0.1):
    """ Given input points and a dimension to predict, clusters on the
    given values of k and returns the error in the resulting predictions,
    as described in Problem 3. Plotting is handled by plotAll below. """
    # Weight the features via the County.weights hook rather than scaling
    # p.attrs in place; mutating the points would corrupt a second call.
    # The weights stay in effect until reassigned.
    County.weights = pylab.array(dimension)
    hold_dist = []
    for k in kvals:
        # Sum of squared differences between each county's poverty rate
        # and its cluster's average poverty rate
        clusters, maxSmallest = kmeans(points, k, cutoff, County)
        poverty = []
        for c in clusters:
            avePoverty = getAvePoverty(c)
            for p in c.getPoints():
                ptPoverty = p.getOriginalAttrs()[2]
                poverty.append((ptPoverty - avePoverty) ** 2)
        tot = sum(poverty)
        hold_dist.append(tot)
    return hold_dist, kvals

def plotAll(points, otherWeights, zeroWeighted, fullWeighted):
    # fullWeighted is accepted but not plotted here
    featureBlind, kvals = graphPredictionErr(points, zeroWeighted)
    featurePredict, kvals = graphPredictionErr(points, otherWeights)
    pylab.plot(featurePredict, label="HS grad & Income")
    pylab.plot(featureBlind, label="All except Poverty %")
    pylab.xlabel("k vals")
    pylab.xticks(range(len(kvals)), kvals)
    pylab.ylabel("Sum Squared Error from Poverty")
    pylab.legend()
    pylab.show()

# Feature order (weight indices 0-13):
# HomeVal, Income, Poverty, Population, Pop Change, Prcnt 65+, Below 18,
# Prcnt Female, Prcnt HS Grad, Prcnt College, Unemployed, Prcnt Below 18,
# Life Expect, Farm Acres

# Keep only Income (1) and Prcnt HS Grad (8)
otherWeights = [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]
# Keep every feature except Poverty (2)
zeroWeighted = [1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
# Keep only HomeVal, Income, and Poverty
fullWeighted = [1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]

plotAll(points, otherWeights, zeroWeighted, fullWeighted)