Merge pull request #2 from l1m5/master

Added simple python script to handle automatic updates and merges
This commit is contained in:
Steven Black
2013-02-20 20:18:52 -08:00
9 changed files with 26353 additions and 9 deletions

View File

@@ -0,0 +1 @@
https://raw.github.com/StevenBlack/hosts/master/data/StevenBlack/hosts

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1 @@
http://www.malwaredomainlist.com/hostslist/hosts.txt

View File

@@ -0,0 +1 @@
http://winhelp2002.mvps.org/hosts.txt

View File

@@ -6,7 +6,7 @@
# Please forward any additions, corrections or comments by email to
# hosts@someonewhocares.org
# Last updated: Jan 30th, 2013 at 23:35
# Last updated: Feb 17th, 2013 at 13:37
# Use this file to prevent your computer from connecting to selected
# internet hosts. This is an easy and effective way to protect you from
@@ -143,6 +143,7 @@ fe80::1%lo0 localhost
127.0.0.1 prolapseman.com
127.0.0.1 scrollbelow.com
127.0.0.1 selfpwn.org
127.0.0.1 sexitnow.com
127.0.0.1 sourmath.com
127.0.0.1 suckdude.com
127.0.0.1 thatsjustgay.com
@@ -215,12 +216,14 @@ fe80::1%lo0 localhost
127.0.0.1 www.phonejapan.com
127.0.0.1 www.pressurespot.com
127.0.0.1 www.prolapseman.com
127.0.0.1 www.punishtube.com
127.0.0.1 www.scrollbelow.com
127.0.0.1 www.selfpwn.org
127.0.0.1 www.sourmath.com
127.0.0.1 www.suckdude.com
127.0.0.1 www.thatsjustgay.com
127.0.0.1 www.thatsphucked.com
127.0.0.1 www.theexgirlfriends.com
127.0.0.1 www.thehomo.org
127.0.0.1 www.themacuser.org
127.0.0.1 www.thepounder.com
@@ -231,7 +234,10 @@ fe80::1%lo0 localhost
127.0.0.1 www.walkthedinosaur.com
127.0.0.1 www.whipcrack.org
127.0.0.1 www.wormgush.com
127.0.0.1 www.xvideoslive.com
127.0.0.1 www.y8.com
127.0.0.1 www.youaresogay.com
127.0.0.1 www.ypmate.com
127.0.0.1 youaresogay.com
#</shock-sites>
@@ -905,7 +911,6 @@ fe80::1%lo0 localhost
127.0.0.1 metrics.premiere.com
127.0.0.1 metrics.rottentomatoes.com
127.0.0.1 metrics.sephora.com
127.0.0.1 metrics.skype.com
127.0.0.1 metrics.soundandvision.com
127.0.0.1 metrics.soundandvisionmag.com
127.0.0.1 metrics.sun.com
@@ -1147,7 +1152,6 @@ fe80::1%lo0 localhost
127.0.0.1 stat1.z-stat.com
127.0.0.1 stat3.cybermonitor.com
127.0.0.1 statcounter.com
127.0.0.1 static.chartbeat.com
127.0.0.1 static.kibboko.com
127.0.0.1 static.smni.com # Santa Monica - popunders
127.0.0.1 statik.topica.com
@@ -2436,6 +2440,7 @@ fe80::1%lo0 localhost
#<2o7-sites>
# 2o7.net -- server side tracking
#127.0.0.1 appleglobal.112.2o7.net #breaks apple.com
127.0.0.1 102.112.2o7.net
127.0.0.1 102.122.2o7.net
127.0.0.1 112.2o7.net
@@ -4105,6 +4110,7 @@ fe80::1%lo0 localhost
127.0.0.1 ads1.globeandmail.com
127.0.0.1 ads1.itadnetwork.co.uk
127.0.0.1 ads1.jev.co.za
127.0.0.1 ads1.msads.net
127.0.0.1 ads1.msn.com
127.0.0.1 ads1.perfadbrite.com.akadns.net
127.0.0.1 ads1.performancingads.com
@@ -4763,7 +4769,6 @@ fe80::1%lo0 localhost
127.0.0.1 api-public.addthis.com
127.0.0.1 api.addthis.com
127.0.0.1 api.affinesystems.com
127.0.0.1 api.chartbeat.com
127.0.0.1 apopt.hbmediapro.com
127.0.0.1 app.scanscout.com
127.0.0.1 apparel-offer.com
@@ -5862,6 +5867,7 @@ fe80::1%lo0 localhost
127.0.0.1 img.layer-ads.de
127.0.0.1 img.sn00.net
127.0.0.1 img.soulmate.com
127.0.0.1 img.xnxx.com
127.0.0.1 img4.cdn.adjuggler.com
127.0.0.1 imgn.dt07.com
127.0.0.1 imgserv.adbutler.com
@@ -6211,6 +6217,7 @@ fe80::1%lo0 localhost
127.0.0.1 msnbe-hp.metriweb.be
127.0.0.1 mt58.mtree.com
127.0.0.1 mu-in-f167.1e100.net
127.0.0.1 multi.xnxx.com
127.0.0.1 mvonline.com
127.0.0.1 mx.adserver.yahoo.com
127.0.0.1 my-reward-channel.com
@@ -6451,7 +6458,6 @@ fe80::1%lo0 localhost
127.0.0.1 phpadsnew.gamefolk.de
127.0.0.1 phpadsnew.wn.com
127.0.0.1 pick-savings.com
127.0.0.1 ping.chartbeat.net
127.0.0.1 pink.habralab.ru
127.0.0.1 pix01.revsci.net
127.0.0.1 pix521.adtech.de
@@ -6690,7 +6696,6 @@ fe80::1%lo0 localhost
127.0.0.1 secure.bidvertiserr.com
127.0.0.1 secure.eloqua.com
127.0.0.1 secure.gaug.es
127.0.0.1 secure.skypeassets.com
127.0.0.1 secure.webconnect.net
127.0.0.1 secureads.ft.com
127.0.0.1 securecontactinfo.com
@@ -6874,7 +6879,6 @@ fe80::1%lo0 localhost
127.0.0.1 static.vpptechnologies.com
127.0.0.1 static.way2traffic.com
127.0.0.1 static1.influads.com
127.0.0.1 static2.chartbeat.com
127.0.0.1 staticads.btopenworld.com
127.0.0.1 staticb.mydirtyhobby.com
127.0.0.1 statistik-gallup.dk
@@ -6899,6 +6903,7 @@ fe80::1%lo0 localhost
127.0.0.1 suresafe1.adsovo.com
127.0.0.1 surplus-suppliers.com
127.0.0.1 survey.112.2o7.net
127.0.0.1 surveycentral.directinsure.info
127.0.0.1 surveygizmo.com
127.0.0.1 surveymonkeycom.tt.omtrdc.net
127.0.0.1 surveypass.com
@@ -7427,6 +7432,7 @@ fe80::1%lo0 localhost
127.0.0.1 www.freecameraprovider.com
127.0.0.1 www.freecamerasource.com
127.0.0.1 www.freecamerauk.co.uk
127.0.0.1 www.freecamsecrets.com
127.0.0.1 www.freecoolgift.com
127.0.0.1 www.freedesignerhandbagreviews.com
127.0.0.1 www.freedinnersource.com
@@ -7871,6 +7877,7 @@ fe80::1%lo0 localhost
127.0.0.1 www3.addthis.com
127.0.0.1 www3.adireland.com
127.0.0.1 www3.bannerspace.com
127.0.0.1 www3.game-advertising-online.com
127.0.0.1 www30.glam.com
127.0.0.1 www30a1-orig.glam.com
127.0.0.1 www30a1.glam.com

View File

@@ -0,0 +1 @@
http://someonewhocares.org/hosts/hosts

24848
hosts

File diff suppressed because one or more lines are too long

View File

@@ -12,6 +12,17 @@ Currently the `hosts` files from the following locations are amalgamated:
* Dan Pollock at [http://someonewhocares.org/hosts/](http://someonewhocares.org/hosts/) updated regularly.
* My own small list in raw form [here](https://raw.github.com/StevenBlack/hosts/master/data/StevenBlack/hosts).
You can add any additional sources you'd like under the data/ directory. Provide a copy of the current `hosts` file and a file called
update.info with the URL to the `hosts` file source. This will allow updateHostsFile.py to automatically update your source.
## Using updateHostsFile.py
This Python script will generate a unique hosts file based on the sources provided. You can either have the script go out and fetch an updated version over the web (defined by the update.info text file in the source's directory), or it will use the `hosts` file you already have checked into your source's data folder.
Usage
python updateHostsFile.py
## What is a hosts file?
A hosts file, named `hosts` (with no file extension), is a plain-text file used by all operating systems to map hostnames to IP addresses.
@@ -35,8 +46,32 @@ For example, to nullify requests to some doubleclick.net servers, adding these l
## Location of your hosts file
To modify your current `hosts` file, look for it in the following places and modify it with a text editor.
**Mac OS X, iOS, Android**: `/etc/hosts` folder.
**Mac OS X, iOS, Android, Linux**: `/etc/hosts` file.
**Windows**: `%SystemRoot%\system32\drivers\etc\hosts` file.
## Reloading hosts file
Your operating system will cache DNS lookups. You can either reboot or run the following commands to manually flush your DNS cache once the new hosts file is in place.
### Mac OS X
Open a Terminal and run:
`dscacheutil -flushcache`
### Windows
Open a Command Prompt:
**Windows XP**: Start -> Run -> `cmd`
**Windows Vista, 7**: Start Button -> type `cmd` -> right-click Command Prompt -> "Run as Administrator"
**Windows 8**: Start -> Swipe Up -> All Apps -> Windows System -> right-click Command Prompt -> "Run as Administrator"
and run:
`ipconfig /flushdns`
### Linux
Open a Terminal and run:
`/etc/rc.d/init.d/nscd restart`

255
updateHostsFile.py Normal file
View File

@@ -0,0 +1,255 @@
#!/usr/bin/env python
# Script by Ben Limmer
# https://github.com/l1m5
#
# This simple Python script will combine all the hosts files you provide
# as sources into one unique hosts file to keep your internet browsing happy.
import os
import re
import string
import sys
import tempfile
import urllib2
# Project Settings
# Directory containing this script; every other path is derived from it.
BASEDIR_PATH = os.path.dirname(os.path.realpath(__file__))
# Each sub-directory of data/ is one source of hosts rules.
DATA_PATH = BASEDIR_PATH + '/data'
# Name of the hosts file expected inside every source directory.
DATA_FILENAMES = 'hosts'
# Name of the per-source file whose first line holds the download URL.
UPDATE_URL_FILENAME = 'update.info'
# Only directories count as sources: a stray file in data/ (for example
# .DS_Store) would otherwise be treated as a source and crash the merge when
# its hosts file is opened. Sorted so the merge order is reproducible instead
# of depending on the order the file system happens to return.
SOURCES = sorted(
    entry for entry in os.listdir(DATA_PATH)
    if os.path.isdir(os.path.join(DATA_PATH, entry))
)

# Exclusions
# Optional run of sub-domain labels; the excluded domain is appended at the end.
EXCLUSION_PATTERN = r'([a-zA-Z\d-]+\.){0,}'

# Common domains to exclude
COMMON_EXCLUSIONS = ['hulu.com']

# Global vars
exclusionRegexs = []   # compiled exclusion patterns, filled by excludeDomain()
duplicatesRemoved = 0  # running count of dropped rules, updated by removeDups()
def main():
    """Run the interactive workflow from prompts to the finished hosts file."""
    # Interactive part: optional refresh of the sources, optional exclusions.
    promptForUpdate()
    promptForExclusions()

    # Merge all sources, drop duplicate rules, then prepend the header.
    merged = createInitialFile()
    deduplicated = removeDups(merged)
    finalizeFile(deduplicated)

    printSuccess('Success! Your shiny new hosts file has been prepared.')
# Prompt the User
def promptForUpdate():
    """Ask whether the data sources should be refreshed and act on the answer."""
    wantsUpdate = query_yes_no("Do you want to update all data sources?") == "yes"
    if not wantsUpdate:
        print('OK, we\'ll stick with what we\'ve got locally.')
        return
    updateAllSources()
def promptForExclusions():
    """Ask whether any domains should be excluded and, if so, collect them."""
    question = ("Do you want to exclude any domains?\n" +
                "For example, hulu.com video streaming must be able to access " +
                "its tracking and ad servers in order to play video.")
    if query_yes_no(question) != "yes":
        print('OK, we won\'t exclude any domains.')
        return
    displayExclusionOptions()
def promptForMoreCustomExclusions():
    """Return True when the user wants to enter another domain, else False."""
    answer = query_yes_no("Do you have more domains you want to enter?")
    return answer == "yes"
# End Prompt the User
# Exclusion logic
def displayExclusionOptions():
    """Offer each common exclusion in turn, then offer custom exclusions."""
    for candidate in COMMON_EXCLUSIONS:
        question = "Do you want to exclude the domain " + candidate + " ?"
        if query_yes_no(question) == "yes":
            excludeDomain(candidate)

    # Once the predefined list is exhausted, let the user add their own.
    if query_yes_no("Do you want to exclude any other domains?") == "yes":
        gatherCustomExclusions()
def gatherCustomExclusions():
    """Read domains from the user until they say they have no more to enter.

    Each entry is excluded only if it passes isValidDomainFormat(); the
    "more domains?" question is asked after every entry, valid or not.
    """
    keepAsking = True
    while keepAsking:
        candidate = raw_input("Enter the domain you want to exclude (e.g. facebook.com): ")
        if isValidDomainFormat(candidate):
            excludeDomain(candidate)
        keepAsking = promptForMoreCustomExclusions()
def excludeDomain(domain):
    """Register *domain* so that rules for it are left out of the output.

    The compiled pattern is EXCLUSION_PATTERN (an optional run of sub-domain
    labels) followed by the literal domain, and is appended to the
    module-level exclusionRegexs list.
    """
    # Escape the user-supplied text so that '.' matches only a literal dot
    # and no other regex metacharacter is interpreted. Unescaped, 'hulu.com'
    # would also match hosts such as 'huluxcom.net'.
    exclusionRegexs.append(re.compile(EXCLUSION_PATTERN + re.escape(domain)))
def matchesExclusions(strippedRule):
    """Return True when the rule's hostname hits any registered exclusion.

    Only the second whitespace-separated field of strippedRule is tested,
    against every pattern in the module-level exclusionRegexs list.
    """
    hostname = strippedRule.split()[1]
    return any(pattern.search(hostname) for pattern in exclusionRegexs)
# End Exclusion Logic
# Update Logic
def updateAllSources():
    """Download a fresh hosts file for every source that declares an update URL.

    Sources without a usable URL are skipped. When a download fails, the error
    is reported and the local copy of that source is left untouched, so one
    unreachable server does not abort the whole run.
    """
    for source in SOURCES:
        updateURL = getUpdateURLFromFile(source)
        if not updateURL:
            # No update file (None) or an empty first line: nothing to fetch.
            continue
        print('Updating source ' + source + ' from ' + updateURL)
        try:
            response = urllib2.urlopen(updateURL)
            try:
                updatedFile = response.read()
            finally:
                response.close()
        except (IOError, ValueError) as error:
            # urllib2.URLError is an IOError subclass; ValueError covers a
            # malformed URL. The download happens before the data file is
            # opened for writing, so the existing copy is not truncated.
            printFailure('Error updating source ' + source + ': ' + str(error))
            continue
        updatedFile = updatedFile.replace('\r', '')  # get rid of carriage-return symbols
        with open(DATA_PATH + '/' + source + '/' + DATA_FILENAMES, 'w') as dataFile:
            dataFile.write(updatedFile)
def getUpdateURLFromFile(source):
    """Return the update URL recorded for *source*, or None if there is none.

    The URL is the stripped first line of the source's update-info file.
    When that file does not exist, a warning is printed and None is returned.
    """
    pathToUpdateFile = DATA_PATH + '/' + source + '/' + UPDATE_URL_FILENAME
    if not os.path.exists(pathToUpdateFile):
        printFailure('Warning: Can\'t find the update file for source ' + source + '\n' +
                     'Make sure that there\'s a file at ' + pathToUpdateFile)
        return None

    updateFile = open(pathToUpdateFile, 'r')
    firstLine = updateFile.readline()
    updateFile.close()
    return firstLine.strip()
# End Update Logic
# File Logic
def createInitialFile():
    """Concatenate the hosts file of every source into one temporary file.

    Each source's content is wrapped in '# Begin <source>' / '# End <source>'
    comment lines. Returns the still-open temporary file; the caller is
    expected to rewind it before reading.
    """
    mergeFile = tempfile.NamedTemporaryFile()
    for source in SOURCES:
        sourcePath = DATA_PATH + '/' + source + '/' + DATA_FILENAMES
        # The with-block closes each source file as soon as it has been
        # copied instead of leaking one open handle per source.
        with open(sourcePath, 'r') as curFile:
            mergeFile.write('\n# Begin ' + source + '\n')
            mergeFile.write(curFile.read())
            mergeFile.write('\n# End ' + source + '\n')
    return mergeFile
def removeDups(mergeFile):
    """Write the merged rules to the output hosts file without duplicates.

    Comment and blank lines are copied through unchanged. A rule is compared
    in its stripped "<field1> <field2>" form: excluded rules are skipped, the
    first occurrence of a rule is written with its original text, and later
    occurrences are counted in the module-level duplicatesRemoved.

    mergeFile is rewound, read completely and closed. Returns the output
    file, still open, for the header to be added by the caller.
    """
    global duplicatesRemoved
    finalFile = open(BASEDIR_PATH + '/hosts', 'w+b')
    mergeFile.seek(0)  # reset file pointer

    rules_seen = set()
    for line in mergeFile.readlines():
        content = line.strip()
        # Test the stripped text so that whitespace-only lines (including a
        # bare '\r\n') and indented comments are passed through instead of
        # reaching stripRule(), which would abort the whole run on them.
        if not content or content.startswith('#'):
            finalFile.write(line)  # maintain the comments for readability
            continue
        strippedRule = stripRule(line)  # strip comments
        if matchesExclusions(strippedRule):
            continue
        if strippedRule not in rules_seen:
            finalFile.write(line)
            rules_seen.add(strippedRule)
        else:
            duplicatesRemoved += 1
    mergeFile.close()

    printSuccess('Removed ' + str(duplicatesRemoved) + ' duplicates from the merged file')
    return finalFile
def finalizeFile(finalFile):
    """Prepend the generated header to *finalFile*, then close it."""
    # The header has to be written while the file is still open.
    writeOpeningHeader(finalFile)
    finalFile.close()
def stripRule(line):
    """Return the first two whitespace-separated fields of *line*, space-joined.

    Some sources put comments around their rules; for accurate comparison
    everything after the second field is dropped here. The comments are
    preserved in the output hosts file, which is written from the original
    line.

    Exits the program with status 1 when the line has fewer than two fields.
    """
    splitLine = line.split()
    if len(splitLine) < 2:
        printFailure('A line in the hostfile is going to cause problems because it is nonstandard\n' +
                     'The line reads ' + line + ' please check your data files. Maybe you have a comment without a #?')
        # A bare sys.exit() reports success (status 0); this is a fatal
        # error, so signal failure to whatever invoked the script.
        sys.exit(1)
    return splitLine[0] + ' ' + splitLine[1]
def writeOpeningHeader(finalFile):
    """Insert the descriptive header at the top of the open *finalFile*.

    The current content is read into memory, the header is written from
    offset 0 and the saved content is written back after it. The header
    lists every entry of SOURCES and the current duplicatesRemoved count.
    """
    finalFile.seek(0)  # reset file pointer
    fileContents = finalFile.read()  # save content

    headerLines = [
        '# This file is a merged collection of hosts from reputable sources,\n',
        '# with a dash of crowd sourcing via Github\n#\n',
        '# Project home page: https://github.com/StevenBlack/hosts\n#\n',
        '# Current sources:\n',
    ]
    headerLines.extend('# ' + source + '\n' for source in SOURCES)
    headerLines.extend([
        '#\n',
        '# Take Note:\n',
        '# Merging these sources produced ' + str(duplicatesRemoved) + ' duplicates\n',
        '# ===============================================================\n',
    ])

    finalFile.seek(0)  # write at the top
    finalFile.writelines(headerLines)
    finalFile.write(fileContents)
# End File Logic
# Helper Functions
## {{{ http://code.activestate.com/recipes/577058/ (r2)
def query_yes_no(question, default="yes"):
    """Ask a yes/no question on the terminal and return "yes" or "no".

    question -- text shown to the user.
    default  -- answer assumed when the user just presses <Enter>: "yes"
                (the default), "no", or None to insist on an explicit answer.

    Raises ValueError for any other value of default. Keeps asking until a
    recognised answer is entered.
    """
    answers = {"yes": "yes", "y": "yes", "ye": "yes",
               "no": "no", "n": "no"}

    # Choose the hint that shows which answer <Enter> selects.
    for candidate, hint in ((None, " [y/n] "),
                            ("yes", " [Y/n] "),
                            ("no", " [y/N] ")):
        if default == candidate:
            prompt = hint
            break
    else:
        raise ValueError("invalid default answer: '%s'" % default)

    while True:
        sys.stdout.write(colorize(question, colors.PROMPT) + prompt)
        choice = raw_input().lower()
        if choice == '' and default is not None:
            return default
        if choice in answers:
            return answers[choice]
        printFailure("Please respond with 'yes' or 'no' "
                     "(or 'y' or 'n').\n")
## end of http://code.activestate.com/recipes/577058/ }}}
def isValidDomainFormat(domain):
    """Return True when *domain* is an acceptable bare domain to exclude.

    An empty string, a leading "www."/"wwwN." label, or an "http://" or
    "https://" scheme is rejected with an explanatory message and False.
    """
    if domain == '':
        print("You didn\'t enter a domain. Try again.")
        return False
    # The scheme test requires '://' so that real domains which merely begin
    # with "http" (for example httpbin.org) are accepted.
    domainRegex = re.compile(r"www\d{0,3}[.]|https?://")
    if domainRegex.match(domain):
        print("The domain " + domain + " is not valid. Do not include www.domain.com or http(s)://domain.com. Try again.")
        return False
    return True
# Colors
class colors:
    """Terminal escape sequences used to colour the script's messages."""
    # Sequence appended after coloured text by colorize().
    ENDC = '\033[0m'
    # One sequence per kind of message.
    FAIL = '\033[91m'
    SUCCESS = '\033[92m'
    PROMPT = '\033[94m'
def colorize(text, color):
    """Return *text* preceded by *color* and followed by colors.ENDC."""
    pieces = (color, text, colors.ENDC)
    return ''.join(pieces)
def printSuccess(text):
    """Print *text* wrapped in the SUCCESS colour."""
    message = colorize(text, colors.SUCCESS)
    print(message)
def printFailure(text):
    """Print *text* wrapped in the FAIL colour."""
    message = colorize(text, colors.FAIL)
    print(message)
# End Helper Functions
# Start the interactive workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()