| 1 | #!/usr/bin/env python |
| 2 | |
| 3 | """ |
| 4 | urlnorm.py - URL normalisation routines |
| 5 | |
| 6 | urlnorm normalises a URL by: |
| 7 | * lowercasing the scheme and hostname |
| 8 | * taking out default port if present (e.g., http://www.foo.com:80/) |
| 9 | * collapsing the path (./, ../, etc) |
| 10 | * removing the last character in the hostname if it is '.' |
| 11 | * unquoting any %-escaped characters |
| 12 | |
| 13 | Available functions: |
| 14 | norms - given a URL (string), returns a normalised URL |
| 15 | norm - given a URL tuple, returns a normalised tuple |
| 16 | test - test suite |
| 17 | |
| 18 | CHANGES: |
| 19 | 0.92 - unknown schemes now pass the port through silently |
| 20 | 0.91 - general cleanup |
| 21 | - changed dictionaries to lists where appropriate |
| 22 | - more fine-grained authority parsing and normalisation |
| 23 | """ |
| 24 | |
| 25 | __license__ = """ |
| 26 | Copyright (c) 1999-2002 Mark Nottingham <[email protected]> |
| 27 | |
| 28 | Permission is hereby granted, free of charge, to any person obtaining a copy |
| 29 | of this software and associated documentation files (the "Software"), to deal |
| 30 | in the Software without restriction, including without limitation the rights |
| 31 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 32 | copies of the Software, and to permit persons to whom the Software is |
| 33 | furnished to do so, subject to the following conditions: |
| 34 | |
| 35 | The above copyright notice and this permission notice shall be included in all |
| 36 | copies or substantial portions of the Software. |
| 37 | |
| 38 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 39 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 40 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 41 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 42 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 43 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| 44 | SOFTWARE. |
| 45 | """ |
| 46 | |
| 47 | __version__ = "0.93" |
| 48 | |
| 49 | from urlparse import urlparse, urlunparse |
| 50 | from urllib import unquote |
| 51 | from string import lower |
| 52 | import re |
| 53 | |
# Path fragments that are replaced by a single '/': "<segment>/.." (with an
# optional trailing '/'), "/./", "//", and a trailing "/." or "/..".
# Raw strings keep the regex escapes away from Python's own string escapes.
_collapse = re.compile(r'([^/]+/\.\./?|/\./|//|/\.$|/\.\.$)')

# Splits an authority into (userinfo, host, port). userinfo and port are
# optional groups and are None when absent. The pattern does not match an
# authority with an empty host or an empty port (e.g. ':80' or 'host:').
_server_authority = re.compile(r'^(?:([^\@]+)\@)?([^\:]+)(?:\:(.+))?$')

# Default port for each known scheme. Values are strings because they are
# compared against the port text captured by _server_authority.
_default_port = {
    'http': '80',
    'https': '443',
    'gopher': '70',
    'news': '119',
    'snews': '563',
    'nntp': '119',
    'snntp': '563',
    'ftp': '21',
    'telnet': '23',
    'prospero': '191',
}

# Schemes whose paths get collapsed ('./', '../', '//'). The empty string
# covers URLs that carry no scheme at all.
_relative_schemes = [
    'http',
    'https',
    'news',
    'snews',
    'nntp',
    'snntp',
    'ftp',
    'file',
    '',
]

# NOTE(review): not referenced by the functions visible in this file;
# presumably the schemes whose authority is "userinfo@host:port" - confirm
# before relying on it.
_server_authority_schemes = [
    'http',
    'https',
    'news',
    'snews',
    'ftp',
]
| 83 | |
| 84 | |
def norms(urlstring):
    """Normalise a URL given as a string and return the result as a string."""
    # Split into a six-tuple, normalise the pieces, then reassemble.
    parsed = urlparse(urlstring)
    normalised = norm(parsed)
    return urlunparse(normalised)
| 88 | |
| 89 | |
def norm(urltuple):
    """Return the normalised form of a six-tuple URL.

    urltuple is (scheme, authority, path, parameters, query, fragment), the
    shape produced by urlparse(). A plain tuple of the same shape is
    returned, with these changes:

    * the scheme is lowercased;
    * in the authority, the host is lowercased and loses one trailing '.',
      and the port is dropped when it equals the scheme's entry in
      _default_port; userinfo is kept verbatim. An authority that
      _server_authority cannot split (for example 'host:' with an empty
      port) is passed through unchanged rather than raising;
    * for schemes listed in _relative_schemes, the path is collapsed with
      _collapse until it stops changing;
    * the path is %-unquoted.

    parameters, query and fragment are returned untouched.
    """
    (scheme, authority, path, parameters, query, fragment) = urltuple
    scheme = scheme.lower()

    if authority:
        parts = _server_authority.match(authority)
        # match() returns None for authorities the pattern does not cover;
        # leave those as they are instead of failing on .groups().
        if parts is not None:
            userinfo, host, port = parts.groups()
            if host.endswith('.'):
                host = host[:-1]
            authority = host.lower()
            if userinfo:
                authority = "%s@%s" % (userinfo, authority)
            # Unknown schemes have no default, so their port always stays.
            if port and port != _default_port.get(scheme):
                authority = "%s:%s" % (authority, port)

    if scheme in _relative_schemes:
        # One substitution per pass: collapsing a fragment can expose a new
        # one (e.g. 'a/b/../..'), so repeat until nothing changes.
        while True:
            collapsed = _collapse.sub('/', path, 1)
            if collapsed == path:
                break
            path = collapsed

    # NOTE(review): unquote() decodes every escape, including reserved
    # characters such as %2F, %3F and %23, which can change what the URL
    # means once it is reassembled. Kept as-is because it is the documented
    # behaviour of this module.
    path = unquote(path)
    return (scheme, authority, path, parameters, query, fragment)
| 112 | |
| 113 | |
| 114 | |
def test():
    """Run the built-in test suite and print a report for every case.

    Each key of the table is an input URL and each value is the string
    norms() is expected to return for it; some cases are taken from
    RFC 1808. Cases run in sorted key order, followed by the totals of
    passed and failed cases. Nothing is returned.
    """
    tests = {
        '/foo/bar/.': '/foo/bar/',
        '/foo/bar/./': '/foo/bar/',
        '/foo/bar/..': '/foo/',
        '/foo/bar/../': '/foo/',
        '/foo/bar/../baz': '/foo/baz',
        '/foo/bar/../..': '/',
        '/foo/bar/../../': '/',
        '/foo/bar/../../baz': '/baz',
        '/foo/bar/../../../baz': '/../baz',
        '/foo/bar/../../../../baz': '/baz',
        '/./foo': '/foo',
        '/../foo': '/../foo',
        '/foo.': '/foo.',
        '/.foo': '/.foo',
        '/foo..': '/foo..',
        '/..foo': '/..foo',
        '/./../foo': '/../foo',
        '/./foo/.': '/foo/',
        '/foo/./bar': '/foo/bar',
        '/foo/../bar': '/bar',
        '/foo//': '/foo/',
        '/foo///bar//': '/foo/bar/',
        'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
        'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
        'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
        'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
        'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
        'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
        # NOTE(review): in the reviewed copy these two cases had their
        # "password@host" part replaced by an e-mail obfuscation
        # placeholder, leaving no '@' and therefore no userinfo to test.
        # Reconstructed so that userinfo is kept verbatim while the host is
        # lowercased - confirm the exact values against upstream.
        'ftp://user:pass@ftp.foo.net/foo/bar': 'ftp://user:pass@ftp.foo.net/foo/bar',
        'http://USER:pass@www.Example.COM/foo/bar': 'http://USER:pass@www.example.com/foo/bar',
        'http://www.example.com./': 'http://www.example.com/',
        '-': '-',
    }

    n_correct, n_fail = 0, 0
    for original in sorted(tests):
        expected = tests[original]
        # Single-argument print() behaves the same as a statement and as a
        # function, so the report text does not depend on the interpreter.
        print('ORIGINAL: %s' % original)
        cleaned = norms(original)
        print('CLEANED:  %s' % cleaned)
        print('CORRECT:  %s' % expected)
        if cleaned != expected:
            print("*** TEST FAILED")
            n_fail += 1
        else:
            n_correct += 1
        print('')
    print("TOTAL CORRECT: %s" % n_correct)
    print("TOTAL FAILURE: %s" % n_fail)
| 169 | |
| 170 | |
# Executed as a script (not imported): run the self-test suite.
if __name__ == "__main__":
    test()