| | 1 | #!/usr/bin/env python |
| | 2 | |
| | 3 | """ |
| | 4 | urlnorm.py - URL normalisation routines |
| | 5 | |
| | 6 | urlnorm normalises a URL by: |
| | 7 | * lowercasing the scheme and hostname |
| | 8 | * taking out default port if present (e.g., http://www.foo.com:80/) |
| | 9 | * collapsing the path (./, ../, etc) |
| | 10 | * removing the last character in the hostname if it is '.' |
| | 11 | * unquoting any %-escaped characters |
| | 12 | |
| | 13 | Available functions: |
| | 14 | norms - given a URL (string), returns a normalised URL |
| | 15 | norm - given a URL tuple, returns a normalised tuple |
| | 16 | test - test suite |
| | 17 | |
| | 18 | CHANGES: |
| | 19 | 0.92 - unknown schemes now pass the port through silently |
| | 20 | 0.91 - general cleanup |
| | 21 | - changed dictionaries to lists where appropriate |
| | 22 | - more fine-grained authority parsing and normalisation |
| | 23 | """ |
| | 24 | |
| | 25 | __license__ = """ |
| | 26 | Copyright (c) 1999-2002 Mark Nottingham <[email protected]> |
| | 27 | |
| | 28 | Permission is hereby granted, free of charge, to any person obtaining a copy |
| | 29 | of this software and associated documentation files (the "Software"), to deal |
| | 30 | in the Software without restriction, including without limitation the rights |
| | 31 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| | 32 | copies of the Software, and to permit persons to whom the Software is |
| | 33 | furnished to do so, subject to the following conditions: |
| | 34 | |
| | 35 | The above copyright notice and this permission notice shall be included in all |
| | 36 | copies or substantial portions of the Software. |
| | 37 | |
| | 38 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| | 39 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| | 40 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| | 41 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| | 42 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| | 43 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
| | 44 | SOFTWARE. |
| | 45 | """ |
| | 46 | |
| | 47 | __version__ = "0.93" |
| | 48 | |
| | 49 | from urlparse import urlparse, urlunparse |
| | 50 | from urllib import unquote |
| | 51 | from string import lower |
| | 52 | import re |
| | 53 | |
| | 54 | _collapse = re.compile('([^/]+/\.\./?|/\./|//|/\.$|/\.\.$)') |
| | 55 | _server_authority = re.compile('^(?:([^\@]+)\@)?([^\:]+)(?:\:(.+))?$') |
| | 56 | _default_port = { 'http': '80', |
| | 57 | 'https': '443', |
| | 58 | 'gopher': '70', |
| | 59 | 'news': '119', |
| | 60 | 'snews': '563', |
| | 61 | 'nntp': '119', |
| | 62 | 'snntp': '563', |
| | 63 | 'ftp': '21', |
| | 64 | 'telnet': '23', |
| | 65 | 'prospero': '191', |
| | 66 | } |
| | 67 | _relative_schemes = [ 'http', |
| | 68 | 'https', |
| | 69 | 'news', |
| | 70 | 'snews', |
| | 71 | 'nntp', |
| | 72 | 'snntp', |
| | 73 | 'ftp', |
| | 74 | 'file', |
| | 75 | '' |
| | 76 | ] |
| | 77 | _server_authority_schemes = [ 'http', |
| | 78 | 'https', |
| | 79 | 'news', |
| | 80 | 'snews', |
| | 81 | 'ftp', |
| | 82 | ] |
| | 83 | |
| | 84 | |
def norms(urlstring):
    """Normalise a URL given as a string and return it as a string.

    The string is split with urlparse(), the resulting pieces are passed
    through norm(), and the normalised pieces are joined again with
    urlunparse().
    """
    pieces = urlparse(urlstring)
    normalised = norm(pieces)
    return urlunparse(normalised)
| | 88 | |
| | 89 | |
def norm(urltuple):
    """Normalise a URL supplied as a six-item tuple.

    urltuple -- (scheme, authority, path, parameters, query, fragment), the
                layout produced by urlparse().

    Returns a plain six-tuple in the same order in which:
      * the scheme is lowercased;
      * a non-empty authority of the form [userinfo@]host[:port] has its
        host lowercased and stripped of one trailing '.', and its port
        dropped when it equals the scheme's entry in _default_port
        (userinfo is kept verbatim; for schemes without an entry the port
        is kept);
      * an authority that does not have that form (for example ':80' or
        'host:') is passed through unchanged rather than raising;
      * the path is collapsed ('./', '../', '//') when the scheme is listed
        in _relative_schemes;
      * the path is %-unquoted.

    Parameters, query and fragment are returned untouched.
    """
    (scheme, authority, path, parameters, query, fragment) = urltuple
    scheme = scheme.lower()

    if authority:
        parsed = _server_authority.match(authority)
        # The pattern requires a non-empty host and, if a ':' is present, a
        # non-empty port.  Anything else is left exactly as supplied.
        if parsed is not None:
            userinfo, host, port = parsed.groups()
            if host.endswith('.'):
                host = host[:-1]
            authority = host.lower()
            if userinfo:
                authority = "%s@%s" % (userinfo, authority)
            if port and port != _default_port.get(scheme):
                authority = "%s:%s" % (authority, port)

    if scheme in _relative_schemes:
        # One substitution per pass: a replacement can expose a new
        # collapsible sequence, so repeat until the path stops changing.
        while True:
            collapsed = _collapse.sub('/', path, 1)
            if collapsed == path:
                break
            path = collapsed

    # NOTE(review): unquote() decodes every %-escape, including reserved
    # characters such as %2F, %3F and %23, which can change the meaning of
    # the URL once it is reassembled -- confirm this is intended.
    path = unquote(path)
    return (scheme, authority, path, parameters, query, fragment)
| | 112 | |
| | 113 | |
| | 114 | |
def test():
    """Run the built-in test suite (some cases taken from RFC 1808).

    Every input URL is passed through norms(), in sorted order of the input
    strings.  For each case the original, the normalised result and the
    expected result are printed, followed by "*** TEST FAILED" when the
    result differs from the expectation and then a blank line.  The totals
    of passing and failing cases are printed at the end.

    Returns None; results are reported on standard output only.
    """
    tests = {
        '/foo/bar/.': '/foo/bar/',
        '/foo/bar/./': '/foo/bar/',
        '/foo/bar/..': '/foo/',
        '/foo/bar/../': '/foo/',
        '/foo/bar/../baz': '/foo/baz',
        '/foo/bar/../..': '/',
        '/foo/bar/../../': '/',
        '/foo/bar/../../baz': '/baz',
        '/foo/bar/../../../baz': '/../baz',
        '/foo/bar/../../../../baz': '/baz',
        '/./foo': '/foo',
        '/../foo': '/../foo',
        '/foo.': '/foo.',
        '/.foo': '/.foo',
        '/foo..': '/foo..',
        '/..foo': '/..foo',
        '/./../foo': '/../foo',
        '/./foo/.': '/foo/',
        '/foo/./bar': '/foo/bar',
        '/foo/../bar': '/bar',
        '/foo//': '/foo/',
        '/foo///bar//': '/foo/bar/',
        'http://www.foo.com:80/foo': 'http://www.foo.com/foo',
        'http://www.foo.com:8000/foo': 'http://www.foo.com:8000/foo',
        'http://www.foo.com./foo/bar.html': 'http://www.foo.com/foo/bar.html',
        'http://www.foo.com.:81/foo': 'http://www.foo.com:81/foo',
        'http://www.foo.com/%7ebar': 'http://www.foo.com/~bar',
        'http://www.foo.com/%7Ebar': 'http://www.foo.com/~bar',
        # userinfo must survive verbatim (case included); only the host
        # part of the authority is lowercased.
        'ftp://user:pass@ftp.foo.net/foo/bar':
            'ftp://user:pass@ftp.foo.net/foo/bar',
        'http://USER:pass@www.Example.COM/foo/bar':
            'http://USER:pass@www.example.com/foo/bar',
        'http://www.example.com./': 'http://www.example.com/',
        '-': '-',
    }

    n_correct, n_fail = 0, 0
    # Single-argument print(...) produces the same output whether print is
    # a statement (Python 2) or a function (Python 3).
    for original in sorted(tests):
        expected = tests[original]
        print('ORIGINAL: ' + original)
        cleaned = norms(original)
        print('CLEANED:  ' + cleaned)
        print('CORRECT:  ' + expected)
        if cleaned != expected:
            print("*** TEST FAILED")
            n_fail += 1
        else:
            n_correct += 1
        print('')
    print("TOTAL CORRECT: %d" % n_correct)
    print("TOTAL FAILURE: %d" % n_fail)
| | 169 | |
| | 170 | |
# Executed as a script (not imported): run the built-in test suite.
if __name__ == "__main__":
    test()