... get on my nerves for quite some time.
As far as I could find out they are used by Google Analytics and represent "The five dimensions of campaign tracking" (Source, Medium, Term, Content, Campaign).
As I have written in prior posts, I am not too fond of being tracked, so I wrote this little Python snippet to delete this information from URLs. I use it in a few of my gateway programs, and it can be used, e.g., in a URL filter for the Squid proxy (see the redirect_program directive).
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
# Query-parameter names that Google Analytics uses for campaign tracking.
UTM_TAGS = (
    u'utm_source',
    u'utm_medium',
    u'utm_term',
    u'utm_content',
    u'utm_campaign',
)

# Two compiled patterns per tag. Each pattern also consumes the delimiter in
# front of the parameter. The value stops at the next '&' or at '#', so a
# trailing URL fragment is left untouched.
utm_re = []
for ut in UTM_TAGS:
    # unescaped form: ?utm_name=value or &utm_name=value
    utm_re.append(re.compile(u"([?&]" + ut + u"=[^&#]*)"))
    # HTML-escaped form: &amp;utm_name=value
    utm_re.append(re.compile(u"(&amp;" + ut + u"=[^&#]*)"))
def unUTM(url):
    """Return *url* with the campaign-tracking parameters removed.

    Every pattern in the module-level ``utm_re`` list is applied to the URL.
    A URL that contains no '?' is returned unchanged.
    """
    if "?" not in url:
        return url
    # The patterns consume the delimiter in front of a parameter; doubling
    # every '?' makes sure one survives when the first parameter is removed.
    url = url.replace("?", "??")
    for pattern in utm_re:
        url = pattern.sub(u"", url)
    url = url.replace("??", "?")
    # Removing the first parameter leaves its successor's delimiter directly
    # after the '?'. Drop that delimiter as a whole, be it '&' or '&amp;'.
    url = re.sub(r"\?&(amp;)?", "?", url)
    # Remove a '?' left dangling at the end, but never one at the start
    # (query-only URLs such as '?a=1' must keep their marker).
    return url.rstrip("?")
Trying to incorporate BlackJack's comments led me to the following code:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
# Query-parameter names that Google Analytics uses for campaign tracking.
UTM_TAGS = (
    u'utm_source',
    u'utm_medium',
    u'utm_term',
    u'utm_content',
    u'utm_campaign',
)

# One pattern for all tags. The '=' sits outside the alternation group so it
# applies to every tag, not just the last one; otherwise any parameter whose
# name merely starts with a tag (e.g. 'utm_sources') would be removed too.
utm_re = re.compile(u''.join((
    u'(^|&(amp;)?)(',     # either at start or preceded by '&' or '&amp;'
    u'|'.join(UTM_TAGS),  # all tags as alternatives
    u')=[^&#]*'           # followed by '=' and all chars up to next '&' or '#'
)))
def unUTM(url):
    """Strip the parameters matched by ``utm_re`` from the query of *url*.

    A URL without a '?' is returned as is. When nothing remains of the query,
    the part in front of the '?' is returned on its own.
    """
    base, delimiter, query = url.partition('?')
    if not delimiter:
        # not parameterized: nothing to do
        return url
    query = utm_re.sub(u'', query)
    if query == '':
        # every parameter was removed, so the '?' goes as well
        return base
    # a removed first parameter may leave a leading '&' or '&amp;' behind
    query = re.sub(u'^&(amp;)?', u'', query)
    return base + u'?' + query
Armin 'argv[0]' Gruner has probably the most elegant solution that doesn't even use regex:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from urlparse import parse_qs, urlsplit, urlunsplit
from urllib import urlencode
from cgi import escape
def unUTM(url):
    """Return *url* without any query parameter whose name starts with 'utm_'.

    The query string is parsed, filtered and re-encoded. The re-encoded query
    is passed through ``escape``, so the remaining parameters end up joined
    by '&amp;'. A URL without a query string is returned unchanged.
    """
    parts = urlsplit(url)
    if not parts.query:
        return url
    # keep_blank_values: parameters such as 'a=' must survive the round trip
    params = parse_qs(parts.query, keep_blank_values=True)
    # Build the filtered mapping explicitly instead of popping keys through
    # map(), which is lazy on Python 3 and would remove nothing there.
    kept = dict(
        (key, values) for key, values in params.items()
        if not key.startswith('utm_')
    )
    parts = list(parts)
    # index 3 of the split result is the query component
    parts[3] = escape(urlencode(kept, doseq=1))
    return urlunsplit(parts)
Comments