Merge pull request #2 from l1m5/master

Added simple python script to handle automatic updates and merges
2026-07-01 10:46:51 +00:00 · 2013-02-20 20:18:52 -08:00
parent 68a566a932 ddca2396b6
commit 4f9db5274c
9 changed files with 26353 additions and 9 deletions
--- a/data/StevenBlack/update.info
+++ b/data/StevenBlack/update.info
@@ -0,0 +1 @@
+https://raw.github.com/StevenBlack/hosts/master/data/StevenBlack/hosts
--- a/data/malwaredomainlist.com/hosts
+++ b/data/malwaredomainlist.com/hosts
--- a/data/malwaredomainlist.com/update.info
+++ b/data/malwaredomainlist.com/update.info
@@ -0,0 +1 @@
+http://www.malwaredomainlist.com/hostslist/hosts.txt
--- a/data/mvps.org/update.info
+++ b/data/mvps.org/update.info
@@ -0,0 +1 @@
+http://winhelp2002.mvps.org/hosts.txt
--- a/data/someonewhocares.org/hosts
+++ b/data/someonewhocares.org/hosts
@@ -6,7 +6,7 @@
 # Please forward any additions, corrections or comments by email to
 # hosts@someonewhocares.org

-# Last updated: Jan 30th, 2013 at 23:35
+# Last updated: Feb 17th, 2013 at 13:37

 # Use this file to prevent your computer from connecting to selected
 # internet hosts. This is an easy and effective way to protect you from 
@@ -143,6 +143,7 @@ fe80::1%lo0     localhost
 127.0.0.1 prolapseman.com
 127.0.0.1 scrollbelow.com
 127.0.0.1 selfpwn.org
+127.0.0.1 sexitnow.com
 127.0.0.1 sourmath.com
 127.0.0.1 suckdude.com
 127.0.0.1 thatsjustgay.com
@@ -215,12 +216,14 @@ fe80::1%lo0     localhost
 127.0.0.1 www.phonejapan.com
 127.0.0.1 www.pressurespot.com
 127.0.0.1 www.prolapseman.com
+127.0.0.1 www.punishtube.com
 127.0.0.1 www.scrollbelow.com
 127.0.0.1 www.selfpwn.org
 127.0.0.1 www.sourmath.com
 127.0.0.1 www.suckdude.com
 127.0.0.1 www.thatsjustgay.com
 127.0.0.1 www.thatsphucked.com
+127.0.0.1 www.theexgirlfriends.com
 127.0.0.1 www.thehomo.org
 127.0.0.1 www.themacuser.org
 127.0.0.1 www.thepounder.com
@@ -231,7 +234,10 @@ fe80::1%lo0     localhost
 127.0.0.1 www.walkthedinosaur.com
 127.0.0.1 www.whipcrack.org
 127.0.0.1 www.wormgush.com
+127.0.0.1 www.xvideoslive.com
+127.0.0.1 www.y8.com
 127.0.0.1 www.youaresogay.com
+127.0.0.1 www.ypmate.com
 127.0.0.1 youaresogay.com
 #</shock-sites>

@@ -905,7 +911,6 @@ fe80::1%lo0     localhost
 127.0.0.1 metrics.premiere.com
 127.0.0.1 metrics.rottentomatoes.com
 127.0.0.1 metrics.sephora.com
-127.0.0.1 metrics.skype.com
 127.0.0.1 metrics.soundandvision.com
 127.0.0.1 metrics.soundandvisionmag.com
 127.0.0.1 metrics.sun.com
@@ -1147,7 +1152,6 @@ fe80::1%lo0     localhost
 127.0.0.1 stat1.z-stat.com
 127.0.0.1 stat3.cybermonitor.com
 127.0.0.1 statcounter.com
-127.0.0.1 static.chartbeat.com
 127.0.0.1 static.kibboko.com
 127.0.0.1 static.smni.com		# Santa Monica - popunders
 127.0.0.1 statik.topica.com
@@ -2436,6 +2440,7 @@ fe80::1%lo0     localhost
 #<2o7-sites>

 # 2o7.net -- server side tracking
+#127.0.0.1 appleglobal.112.2o7.net	#breaks apple.com
 127.0.0.1 102.112.2o7.net
 127.0.0.1 102.122.2o7.net
 127.0.0.1 112.2o7.net
@@ -4105,6 +4110,7 @@ fe80::1%lo0     localhost
 127.0.0.1 ads1.globeandmail.com
 127.0.0.1 ads1.itadnetwork.co.uk
 127.0.0.1 ads1.jev.co.za
+127.0.0.1 ads1.msads.net
 127.0.0.1 ads1.msn.com
 127.0.0.1 ads1.perfadbrite.com.akadns.net
 127.0.0.1 ads1.performancingads.com
@@ -4763,7 +4769,6 @@ fe80::1%lo0     localhost
 127.0.0.1 api-public.addthis.com
 127.0.0.1 api.addthis.com
 127.0.0.1 api.affinesystems.com
-127.0.0.1 api.chartbeat.com
 127.0.0.1 apopt.hbmediapro.com
 127.0.0.1 app.scanscout.com
 127.0.0.1 apparel-offer.com
@@ -5862,6 +5867,7 @@ fe80::1%lo0     localhost
 127.0.0.1 img.layer-ads.de
 127.0.0.1 img.sn00.net
 127.0.0.1 img.soulmate.com
+127.0.0.1 img.xnxx.com
 127.0.0.1 img4.cdn.adjuggler.com
 127.0.0.1 imgn.dt07.com
 127.0.0.1 imgserv.adbutler.com
@@ -6211,6 +6217,7 @@ fe80::1%lo0     localhost
 127.0.0.1 msnbe-hp.metriweb.be
 127.0.0.1 mt58.mtree.com
 127.0.0.1 mu-in-f167.1e100.net
+127.0.0.1 multi.xnxx.com
 127.0.0.1 mvonline.com
 127.0.0.1 mx.adserver.yahoo.com
 127.0.0.1 my-reward-channel.com
@@ -6451,7 +6458,6 @@ fe80::1%lo0     localhost
 127.0.0.1 phpadsnew.gamefolk.de
 127.0.0.1 phpadsnew.wn.com
 127.0.0.1 pick-savings.com
-127.0.0.1 ping.chartbeat.net
 127.0.0.1 pink.habralab.ru
 127.0.0.1 pix01.revsci.net
 127.0.0.1 pix521.adtech.de
@@ -6690,7 +6696,6 @@ fe80::1%lo0     localhost
 127.0.0.1 secure.bidvertiserr.com
 127.0.0.1 secure.eloqua.com
 127.0.0.1 secure.gaug.es
-127.0.0.1 secure.skypeassets.com
 127.0.0.1 secure.webconnect.net
 127.0.0.1 secureads.ft.com
 127.0.0.1 securecontactinfo.com
@@ -6874,7 +6879,6 @@ fe80::1%lo0     localhost
 127.0.0.1 static.vpptechnologies.com
 127.0.0.1 static.way2traffic.com
 127.0.0.1 static1.influads.com
-127.0.0.1 static2.chartbeat.com
 127.0.0.1 staticads.btopenworld.com
 127.0.0.1 staticb.mydirtyhobby.com
 127.0.0.1 statistik-gallup.dk
@@ -6899,6 +6903,7 @@ fe80::1%lo0     localhost
 127.0.0.1 suresafe1.adsovo.com
 127.0.0.1 surplus-suppliers.com
 127.0.0.1 survey.112.2o7.net
+127.0.0.1 surveycentral.directinsure.info
 127.0.0.1 surveygizmo.com
 127.0.0.1 surveymonkeycom.tt.omtrdc.net
 127.0.0.1 surveypass.com
@@ -7427,6 +7432,7 @@ fe80::1%lo0     localhost
 127.0.0.1 www.freecameraprovider.com
 127.0.0.1 www.freecamerasource.com
 127.0.0.1 www.freecamerauk.co.uk
+127.0.0.1 www.freecamsecrets.com
 127.0.0.1 www.freecoolgift.com
 127.0.0.1 www.freedesignerhandbagreviews.com
 127.0.0.1 www.freedinnersource.com
@@ -7871,6 +7877,7 @@ fe80::1%lo0     localhost
 127.0.0.1 www3.addthis.com
 127.0.0.1 www3.adireland.com
 127.0.0.1 www3.bannerspace.com
+127.0.0.1 www3.game-advertising-online.com
 127.0.0.1 www30.glam.com
 127.0.0.1 www30a1-orig.glam.com
 127.0.0.1 www30a1.glam.com
--- a/data/someonewhocares.org/update.info
+++ b/data/someonewhocares.org/update.info
@@ -0,0 +1 @@
+http://someonewhocares.org/hosts/hosts
--- a/24848
+++ b/24848
--- a/readme.md
+++ b/readme.md
@@ -12,6 +12,17 @@ Currently the `hosts` files from the following locations are amalgamated:
 * Dan Pollock at [http://someonewhocares.org/hosts/](http://someonewhocares.org/hosts/) updated regularly.
 * My own small list in raw form [here](https://raw.github.com/StevenBlack/hosts/master/data/StevenBlack/hosts).

+You can add any additional sources you'd like under the data/ directory. Provide a copy of the current `hosts` file and a file called
+update.info with the URL to the `hosts` file source. This will allow updateHostsFile.py to automatically update your source.
+
+## Using updateHostsFile.py
+
+This Python script will generate a unique hosts file based on the sources provided. You can either have the script go out and fetch an updated version over the web (defined by the update.info text file in the source's directory), or it will use the `hosts` file you already have checked into your source's data folder.
+
+Usage
+
+    python updateHostsFile.py
+
 ## What is a hosts file?

 A hosts file, named `hosts` (with no file extension), is a plain-text file used by all operating systems to map hostnames to IP addresses. 
@@ -35,8 +46,32 @@ For example, to nullify requests to some doubleclick.net servers, adding these l
 ## Location of your hosts file
 To modify your current `hosts` file, look for it in the following places and modify it with a text editor.

-**Mac OS X, iOS, Android**: `/etc/hosts` folder.
+**Mac OS X, iOS, Android, Linux**: `/etc/hosts` folder.

 **Windows**: `%SystemRoot%\system32\drivers\etc\hosts` folder.

+## Reloading hosts file
+Your operating system will cache DNS lookups. You can either reboot or run the following commands to manually flush your DNS cache once the new hosts file is in place.

+### Mac OS X
+Open a Terminal and run:
+
+`dscacheutil -flushcache`
+
+### Windows
+Open a Command Prompt:
+
+**Windows XP**: Start -> Run -> `cmd`
+
+**Windows Vista, 7**: Start Button -> type `cmd` -> right-click Command Prompt -> "Run as Administrator"
+
+**Windows 8**: Start -> Swipe Up -> All Apps -> Windows System -> right-click Command Prompt -> "Run as Administrator"
+
+and run:
+
+`ipconfig /flushdns`
+
+### Linux
+Open a Terminal and run:
+
+`/etc/rc.d/init.d/nscd restart`
--- a/updateHostsFile.py
+++ b/updateHostsFile.py
@@ -0,0 +1,255 @@
+#!/usr/bin/env python
+
+# Script by Ben Limmer
+# https://github.com/l1m5
+#
+# This simple Python script will combine all the host files you provide
+# as sources into one, unique host file to keep your internet browsing happy.
+
+import os
+import re
+import string
+import sys
+import tempfile
+import urllib2
+
+# Project Settings
+BASEDIR_PATH = os.path.dirname(os.path.realpath(__file__))
+DATA_PATH = BASEDIR_PATH + '/data'
+DATA_FILENAMES = 'hosts'
+UPDATE_URL_FILENAME = 'update.info'
+SOURCES = os.listdir(DATA_PATH)
+
+# Exclusions
+EXCLUSION_PATTERN = '([a-zA-Z\d-]+\.){0,}' #append domain the end
+
+# Common domains to exclude
+COMMON_EXCLUSIONS = ['hulu.com']
+
+# Global vars
+exclusionRegexs = []
+duplicatesRemoved = 0;
+
+def main():
+	promptForUpdate()
+	promptForExclusions()
+	mergeFile = createInitialFile()
+	finalFile = removeDups(mergeFile)
+	finalizeFile(finalFile)
+	printSuccess('Success! Your shiny new hosts file has been prepared.')
+
+# Prompt the User
+def promptForUpdate():
+	response = query_yes_no("Do you want to update all data sources?")
+	if (response == "yes"):
+		updateAllSources()
+	else:
+		print 'OK, we\'ll stick with what we\'ve  got locally.'
+
+def promptForExclusions():
+	response = query_yes_no("Do you want to exclude any domains?\n" +
+							"For example, hulu.com video streaming must be able to access " +
+							"its tracking and ad servers in order to play video.")
+	if (response == "yes"):
+		displayExclusionOptions()
+	else:
+		print 'OK, we won\'t exclude any domains.'
+
+def promptForMoreCustomExclusions():
+	response = query_yes_no("Do you have more domains you want to enter?")
+	if (response == "yes"):
+		return True
+	else:
+		return False
+# End Prompt the User
+
+# Exclusion logic
+def displayExclusionOptions():
+	for exclusionOption in COMMON_EXCLUSIONS:
+		response = query_yes_no("Do you want to exclude the domain " + exclusionOption + " ?")
+		if (response == "yes"):
+			excludeDomain(exclusionOption)
+		else:
+			continue
+	response = query_yes_no("Do you want to exclude any other domains?")
+	if (response == "yes"):
+		gatherCustomExclusions()
+
+def gatherCustomExclusions():
+	while True:
+		domainFromUser = raw_input("Enter the domain you want to exclude (e.g. facebook.com): ")
+		if (isValidDomainFormat(domainFromUser)):
+			excludeDomain(domainFromUser)
+		if (promptForMoreCustomExclusions() == False):
+			return
+
+def excludeDomain(domain):
+	exclusionRegexs.append(re.compile(EXCLUSION_PATTERN + domain))
+
+def matchesExclusions(strippedRule):
+	strippedDomain = strippedRule.split()[1]
+	for exclusionRegex in exclusionRegexs:
+		if exclusionRegex.search(strippedDomain):
+			return True
+	return False
+# End Exclusion Logic
+
+# Update Logic
+def updateAllSources():
+	for source in SOURCES:
+		updateURL = getUpdateURLFromFile(source)
+		if (updateURL == None):
+			continue;
+		print 'Updating source ' + source + ' from ' + updateURL
+		updatedFile = urllib2.urlopen(updateURL)
+		updatedFile = updatedFile.read()
+		updatedFile = string.replace( updatedFile, '\r', '' ) #get rid of carriage-return symbols
+
+		dataFile   = open(DATA_PATH + '/' + source + '/' + DATA_FILENAMES, 'w')
+		dataFile.write(updatedFile)
+		dataFile.close()
+
+def getUpdateURLFromFile(source):
+	pathToUpdateFile = DATA_PATH + '/' + source + '/' + UPDATE_URL_FILENAME
+	if os.path.exists(pathToUpdateFile):
+		updateFile = open(pathToUpdateFile, 'r')
+		retURL = updateFile.readline().strip()
+		updateFile.close()
+	else:
+		retURL = None
+		printFailure('Warning: Can\'t find the update file for source ' + source + '\n' +
+					 'Make sure that there\'s a file at ' + pathToUpdateFile)
+	return retURL
+# End Update Logic
+
+# File Logic
+def createInitialFile():
+	mergeFile = tempfile.NamedTemporaryFile()
+	for source in SOURCES:
+		curFile = open(DATA_PATH + '/' + source +'/' + DATA_FILENAMES, 'r')
+		mergeFile.write('\n# Begin ' + source + '\n')
+		mergeFile.write(curFile.read())
+		mergeFile.write('\n# End ' + source + '\n')
+	return mergeFile
+
+def removeDups(mergeFile):
+	global duplicatesRemoved
+	finalFile = open(BASEDIR_PATH + '/hosts', 'w+b')
+	mergeFile.seek(0) # reset file pointer
+
+	rules_seen = set()
+	for line in mergeFile.readlines():
+		if line[0].startswith("#") or line[0] == '\n':
+			finalFile.write(line) #maintain the comments for readability
+			continue
+		strippedRule = stripRule(line) #strip comments
+		if matchesExclusions(strippedRule):
+			continue
+		if strippedRule not in rules_seen:
+			finalFile.write(line)
+			rules_seen.add(strippedRule)
+		else:
+			duplicatesRemoved += 1
+
+	mergeFile.close()
+
+	printSuccess('Removed ' + str(duplicatesRemoved) + ' duplicates from the merged file')
+	return finalFile
+
+def finalizeFile(finalFile):
+	"""Prepend the summary header to the merged hosts file, then close it.
+
+	finalFile -- the open file object that main() receives from removeDups().
+	"""
+	writeOpeningHeader(finalFile)
+	finalFile.close()
+
+# Some sources put comments around their rules, for accuracy we need to strip them
+# the comments are preserved in the output hosts file
+def stripRule(line):
+	splitLine = line.split()
+	if (len(splitLine) < 2) :
+		printFailure('A line in the hostfile is going to cause problems because it is nonstandard\n' +
+					 'The line reads ' + line + ' please check your data files. Maybe you have a comment without a #?')
+		sys.exit()
+	return splitLine[0] + ' ' + splitLine[1]
+
+def writeOpeningHeader(finalFile):
+	global duplicatesRemoved
+	finalFile.seek(0) #reset file pointer
+	fileContents = finalFile.read(); #save content
+	finalFile.seek(0) #write at the top
+	finalFile.write('# This file is a merged collection of hosts from reputable sources,\n')
+	finalFile.write('# with a dash of crowd sourcing via Github\n#\n')
+	finalFile.write('# Project home page: https://github.com/StevenBlack/hosts\n#\n')
+	finalFile.write('# Current sources:\n')
+	for source in SOURCES:
+		finalFile.write('#    ' + source + '\n')
+	finalFile.write('#\n')
+	finalFile.write('# Take Note:\n')
+	finalFile.write('# Merging these sources produced ' + str(duplicatesRemoved) + ' duplicates\n')
+	finalFile.write('# ===============================================================\n')
+	finalFile.write(fileContents)
+# End File Logic
+
+# Helper Functions
+## {{{ http://code.activestate.com/recipes/577058/ (r2)
+def query_yes_no(question, default="yes"):
+    """Ask a yes/no question via raw_input() and return their answer.
+    
+    "question" is a string that is presented to the user.
+    "default" is the presumed answer if the user just hits <Enter>.
+        It must be "yes" (the default), "no" or None (meaning
+        an answer is required of the user).
+
+    The "answer" return value is one of "yes" or "no".
+    """
+    valid = {"yes":"yes",   "y":"yes",  "ye":"yes",
+             "no":"no",     "n":"no"}
+    if default == None:
+        prompt = " [y/n] "
+    elif default == "yes":
+        prompt = " [Y/n] "
+    elif default == "no":
+        prompt = " [y/N] "
+    else:
+        raise ValueError("invalid default answer: '%s'" % default)
+
+    while 1:
+        sys.stdout.write(colorize(question, colors.PROMPT) + prompt)
+        choice = raw_input().lower()
+        if default is not None and choice == '':
+            return default
+        elif choice in valid.keys():
+            return valid[choice]
+        else:
+            printFailure("Please respond with 'yes' or 'no' "\
+                             "(or 'y' or 'n').\n")
+## end of http://code.activestate.com/recipes/577058/ }}}
+
+def isValidDomainFormat(domain):
+	if (domain == ''):
+		print "You didn\'t enter a domain. Try again."
+		return False
+	domainRegex = re.compile("www\d{0,3}[.]|https?")
+	if (domainRegex.match(domain)):
+		print "The domain " + domain + " is not valid. Do not include www.domain.com or http(s)://domain.com. Try again."
+		return False
+	else:
+		return True
+
+# Colors
+# Terminal escape sequences used to color the script's console output.
+class colors:
+    PROMPT 	= '\033[94m'  # applied to questions in query_yes_no()
+    SUCCESS = '\033[92m'  # applied by printSuccess()
+    FAIL 	= '\033[91m'  # applied by printFailure()
+    ENDC 	= '\033[0m'  # appended by colorize() after the colored text
+
+def colorize(text, color):
+	return color + text + colors.ENDC
+
+def printSuccess(text):
+	print colorize(text, colors.SUCCESS)
+
+def printFailure(text):
+	print colorize(text, colors.FAIL)
+# End Helper Functions
+
+# Start the interactive workflow only when this file is executed as a script.
+if __name__ == "__main__":
+	main()
				`@@ -0,0 +1 @@`
				`https://raw.github.com/StevenBlack/hosts/master/data/StevenBlack/hosts`
				`@@ -0,0 +1 @@`
				`http://www.malwaredomainlist.com/hostslist/hosts.txt`