#!/usr/bin/python
"""Heuristic spam scoring and captcha helpers for submitted comments.

A submission is described by an ip (or the literal 'openid'), a url and a
text body.  `spamtally` compares it with recently stored comments and known
spam and returns lists of credit and debit labels; `spamrank`, `warn` and
`spammer` turn that tally into decisions.  `captcha` / `captcha_valid`
create and redeem one-shot captcha images.

This is Python 2 code (sgmllib, ConfigParser, md5, str.decode).
"""
import os
import re
import sgmllib
import time
from glob import glob

from config import directory

# Substrings whose presence in a comment body earns a 'spamword' debit.
spamwords = [
    'adipex',
    'ambien',
    'casino',
    'cialis',
    'clenbuterol',
    'dianabol',
    'hydrocodone',
    'lesbian',
    'levitra',
    'phentermine',
    'propecia',
    'tramadol',
    'ugg',
    'valium',
    'viagra',
    'vicodin',
    'xanax',
    'xenical',
]


class link(sgmllib.SGMLParser):
    """Collect the title, href and class attributes of <a> tags in *body*.

    After construction `title`, `href` and `className` hold the values seen
    on anchor tags, or None if the attribute never appeared.  When several
    anchors are present, a later anchor overwrites only the attributes it
    actually carries.
    """

    def __init__(self, body):
        sgmllib.SGMLParser.__init__(self)
        self.title = None
        self.className = None
        self.href = None
        self.feed(body)

    def start_a(self, attrs):
        attrs = dict(attrs)
        if 'title' in attrs:
            self.title = attrs['title']
        if 'href' in attrs:
            self.href = attrs['href']
        if 'class' in attrs:
            self.className = attrs['class']


def registered(baseurl):
    """Return True if a registry file names *baseurl* as its owner id.

    Each file matching directory.registry + '*' is read with ConfigParser
    and its [owner] id option is compared with *baseurl*.  Falls off the end
    (returning None, which is falsy) when nothing matches.
    """
    from ConfigParser import ConfigParser
    for path in glob(directory.registry + '*'):
        parser = ConfigParser()
        parser.read(path)
        if parser.get('owner', 'id') == baseurl:
            return True


def warn(entry, baseip, baseurl, basetext):
    """Return the list of warning labels that apply to a prospective comment.

    Labels, based on spamrank():
      'block'   - rank above 2
      'rate'    - rank of exactly 2 and the rate-only rank is above 0
      'spam'    - rank of exactly 2 and rank exceeds twice the rate-only rank
      'captcha' - rank of exactly 1
      'age'     - directory.data + entry + '.txt' was last modified more
                  than 30 days ago
    """
    warnings = []
    rank = spamrank(baseip, baseurl, basetext)

    if rank > 2:
        warnings.append('block')
    elif rank > 1:
        rate = spamrank(baseip, baseurl, basetext, rateonly=1)
        if rate > 0:
            warnings.append('rate')
        if rank > rate * 2:
            warnings.append('spam')
    elif rank > 0:  # or baseurl.find('intertwingly') >= 0:
        warnings.append('captcha')

    # NOTE(review): the original indentation was lost; the age check is
    # assumed to be independent of the rank checks above -- confirm.
    try:
        entry_path = directory.data + entry + '.txt'
        if (time.time() - os.stat(entry_path).st_mtime) > 30 * 86400:
            warnings.append('age')
    except Exception:
        # best effort: a missing or unreadable entry simply has no age warning
        pass

    return warnings


def _captcha_digest(word):
    """Return the hex md5 digest under which the captcha for *word* is stored."""
    import md5
    return md5.new(word).hexdigest().strip()


def captcha():
    """Create a new captcha image and return (word, digest).

    Captcha files older than one hour are removed first.  A random six-letter
    all-lowercase word is then taken from /usr/share/dict/words and rendered
    by the external `convert` command into directory.captcha + digest + '.png'.

    Raises ValueError if the word list contains no suitable word.
    """
    import random
    words_path = '/usr/share/dict/words'

    # remove any expired captchas
    expired = time.time() - 3600.0
    for name in glob(os.path.join(directory.captcha, '*')):
        try:
            if os.stat(name).st_mtime < expired:
                os.remove(name)
        except OSError:
            # already removed by somebody else, or not removable: ignore
            pass

    # create a new captcha
    handle = open(words_path)
    try:
        words = handle.read().split()
    finally:
        handle.close()
    # Filtering up front (instead of retrying random picks until one matches)
    # cannot loop forever when no word qualifies.
    candidates = [w for w in words
                  if len(w) == 6 and re.match("^[a-z]*$", w)]
    if not candidates:
        raise ValueError('no six-letter lowercase words in ' + words_path)
    word = random.choice(candidates)

    cmd = 'convert -size 160x45 xc:transparent -pointsize 36 -fill darkred ' + \
        '''-draw "text 20,30 '%s'" %s%s.png'''
    digest = _captcha_digest(word)
    os.system(cmd % (word, directory.captcha, digest))
    return (word, digest)


def captcha_valid(word):
    """Return True if a captcha image for *word* exists, consuming it.

    The image file is deleted so that each captcha can be redeemed only once.
    Falls off the end (returning None, which is falsy) when no image exists.
    """
    path = directory.captcha + _captcha_digest(word) + '.png'
    if os.path.exists(path):
        try:
            os.unlink(path)
        except OSError:
            pass
        return True


def spamrank(baseip, baseurl, basetext, rateonly=0):
    """Return the number of debits in excess of credits, never less than 0."""
    credits, debits = spamtally(baseip, baseurl, basetext, rateonly)
    return max(len(debits) - len(credits), 0)


def _to_unicode(text):
    """Decode a byte string, returning (text, was_valid_utf8).

    Byte strings are decoded as UTF-8, falling back to ISO-8859-1; anything
    that is not a byte string is returned untouched.
    """
    if not isinstance(text, str):
        return text, True
    try:
        return text.decode('utf-8'), True
    except UnicodeDecodeError:
        return text.decode('iso-8859-1'), False


def _read_comment(path):
    """Return the UTF-8 decoded contents of *path*, or None if it cannot be opened."""
    try:
        handle = open(path)
    except IOError:
        return None
    try:
        return handle.read().decode('utf-8')
    finally:
        handle.close()


def _marker_debits(text, marker, label):
    """Return [label] if *marker* occurs in *text*, plus 'multi-'+label
    when splitting on it yields more than 7 pieces."""
    found = []
    if text.find(marker) >= 0:
        found.append(label)
        if len(text.split(marker)) > 7:
            found.append('multi-' + label)
    return found


def spamtally(baseip, baseurl, basetext, rateonly=0):
    """Return (credits, debits): two lists of labels scoring a submission.

    baseip   -- value compared with the title attribute of stored comments,
                or the literal 'openid'
    baseurl  -- value compared with the href attribute of stored comments
    basetext -- comment body (byte string or unicode)
    rateonly -- when true, return right after the comparison with recent
                comments and recent spam, skipping the content checks
    """
    # if baseurl and baseurl.find('intertwingly.net')>=0: return [[], []]
    # if baseurl and baseurl.find('diveintomark')>=0: return 0
    # if baseurl and baseurl.find('redmonk.com/sogrady')>=0: return 0
    # if baseurl and baseurl.find('torrez.us')>=0: return 0
    # if baseurl and baseurl.find('2pauls.com')>=0: return 99
    # if baseurl and baseurl.find('subasta.pl')>=0: return 99
    # if baseurl and baseurl.find('einemillioneurohomepage.de')>=0: return 99
    import math

    # overall parameters
    # NOTE(review): mktime() treats the UTC struct as local time, so `now`
    # differs from time.time() by the local UTC offset; kept as found.
    now = time.mktime(time.gmtime())
    today = now - 86400.0
    three_days = now - 3 * 86400.0
    five_minutes = now - 300.0
    recent = 5
    debits = []
    credits = []
    cmt_re = re.compile(r'\d+-(\d+)\.cmt$')

    # Decode up front so the same-content comparison below is unicode against
    # unicode; the 'encoding' debit is still only applied after the
    # rate-only return, as before.
    basetext, valid_utf8 = _to_unicode(basetext)

    if baseurl and baseurl.find('uggprovide.com') >= 0:
        debits += 3 * ['spamword']

    # gather up recent comments
    comments = []
    for path in glob(directory.data + "*.cmt"):
        try:
            # timestamp from the file name when it has one, else from the file
            mtime = int((cmt_re.findall(path) or [os.stat(path).st_mtime])[0])
        except OSError:
            continue  # removed as spam?
        if mtime >= today:
            comments.append([mtime, path])
        if mtime >= five_minutes:
            recent += 1

    # if recent>20: debits += 10*['recent']
    if len(comments) > 100:
        debits += 100 * ['comments']
    comments.sort()
    comments = comments[-recent:]

    # add in up to 72 hours of spam
    for path in glob(directory.spam + "*.cmt"):
        try:
            mtime = os.stat(path).st_mtime
        except OSError:
            continue  # vanished since glob()
        if mtime >= three_days:
            comments.append([mtime, path])

    # look for similarities
    for mtime, path in comments:
        data = _read_comment(path)
        if data is None:
            continue
        # parse once, and only when an attribute will actually be compared
        sig = None
        if baseurl or baseip:
            sig = link(data)

        # NOTE(review): the original indentation was lost; the openid check
        # is assumed to be nested under the same-url match -- confirm.
        if baseurl and sig.href == baseurl:
            # multiple posts with the same URL
            debits.append('same-url')
            if sig.className == 'openid':
                baseip = 'openid'
        elif baseip and sig.title == baseip:
            # multiple posts from the same ip address
            debits.append('same-ip')
        elif '\n'.join(data.split('\n')[1:-1]) == basetext:
            # multiple posts of the same content
            debits.append('same-content')
            # give a time based boost to this count, using this comment's
            # own timestamp (the original picked up a value leaked from a
            # list comprehension, i.e. the last entry's timestamp)
            age = max(time.time() - mtime, 0)
            debits += int(math.sqrt(age / 240)) * ['content-rate']

    if rateonly:
        return credits, debits

    if not valid_utf8:
        debits.append('encoding')

    if basetext.find(' style="position: absolute; left: -') >= 0:
        # lame attempt to hide
        debits.append('position')

    # lame popup window
    debits += _marker_debits(basetext, '" target="_blank>"', 'popup')

    if basetext and baseurl and basetext.find(baseurl) >= 0:
        # trying too hard at self-promotion
        debits.append('self-promotion')

    if basetext:
        for word in spamwords:
            if basetext.find(word) >= 0:
                # druggies and gamblers
                debits.append('spamword')
                break

    # lame UBB code
    debits += _marker_debits(basetext, '[/URL]', 'UBB')
    debits += _marker_debits(basetext, '[/url]', 'ubb')
    # naked urls dropped into comments
    debits += _marker_debits(basetext, '[link]', 'link')
    # lame html
    debits += _marker_debits(basetext, ' HREF=', 'HREF')
    # more lame html
    debits += _marker_debits(basetext, '<a href=http', 'unquoted-href')

    if len(debits) > 0 and baseip:
        import community
        # NOTE(review): the original indentation was lost; the registered
        # check is assumed to be nested under the openid branch -- confirm.
        if baseip == 'openid':
            # openid bonus
            credits.append('openid')
            if len(debits) > len(credits) and registered(baseurl):
                credits.append('registered')
        elif not community.find(baseip, baseurl):
            # stranger
            debits.append('stranger')

    return credits, debits


def spammer(text):
    """Return True if the stored comment *text* ranks as spam (rank above 2).

    The body is every line but the last; ip and url come from the title and
    href attributes of the anchor tags in *text*, with the ip replaced by
    'openid' when the class attribute says so.  On a positive verdict the
    tally is recorded through nonce.generate() with a one day expiry.
    Falls off the end (returning None, which is falsy) otherwise.
    """
    basetext = '\n'.join(text.split('\n')[:-1])
    sig = link(text)
    baseip = sig.title
    baseurl = sig.href
    if sig.className == 'openid':
        baseip = sig.className

    # tally once; the rank is derived from the same lists that get recorded
    tokens = spamtally(baseip, baseurl, basetext)
    credits, debits = tokens
    if len(debits) - len(credits) > 2:
        import nonce
        nonce.generate(time.time() + 86400,
                       repr(['throttle traceback', tokens, text]))
        return True


if __name__ == '__main__':
    # directory.data = '/home/rubys/web/intertwingly.net/spam/'
    import sys
    if len(sys.argv) < 2:
        sys.exit('usage: %s HREF' % sys.argv[0])
    # NOTE(review): the template here was an empty string, which makes the
    # % operator raise TypeError.  It presumably was an anchor tag carrying
    # the argument as its href -- confirm against the original source.
    print(spammer('<a href="%s"></a>' % sys.argv[1]))