Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 12 additions & 7 deletions src/publicsuffix2/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,9 +124,9 @@ def _find_node(self, parent, parts):
# if child already exists as a node, grab the sub-Trie
child_node = children.get(child, None)

# if it doesn't exist, creates a new node and initialized with [0]
# if it doesn't exist, creates a new node and initialized with [-1]
if not child_node:
children[child] = child_node = [0]
children[child] = child_node = [-1]

return self._find_node(child_node, parts)

Expand Down Expand Up @@ -222,7 +222,7 @@ def _lookup_node(self, matches, depth, parent, parts, wildcard):
# See: Algorithm 2 at https://publicsuffix.org/list/
matches[-depth] = 0

if parent in (0, 1):
if parent in (-1, 0, 1):
return

children = parent[1]
Expand All @@ -232,11 +232,12 @@ def _lookup_node(self, matches, depth, parent, parts, wildcard):
child = children.get(name, None)
if child is not None:
if wildcard or name != '*':
if child in (0, 1):
if child in (-1, 0, 1):
negate = child
else:
negate = child[0]
matches[-depth] = negate
if negate != -1:
matches[-depth] = negate
self._lookup_node(matches, depth + 1, child, parts, wildcard)

def get_sld(self, domain, wildcard=True, strict=False):
Expand Down Expand Up @@ -302,15 +303,19 @@ def get_tld(self, domain, wildcard=True, strict=False):
parts = domain.lower().strip('.').split('.')
hits = [None] * len(parts)
if strict and (
self.root in (0, 1) or parts[-1] not in self.root[1].keys()
self.root in (-1, 0, 1) or parts[-1] not in self.root[1].keys()
):
return None

self._lookup_node(hits, 1, self.root, parts, wildcard)

for i, what in enumerate(hits):
if what is not None and what == 0:
if what is None:
pass
elif what == 0:
return '.'.join(parts[i:])
elif what == 1:
return '.'.join(parts[i + 1:])


_PSL = None
Expand Down
19 changes: 13 additions & 6 deletions tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,8 +212,8 @@ def test_get_sld_exceptions(self):
def test_get_sld_no_wildcard(self):
psl = publicsuffix.PublicSuffixList()
# test completion when no wildcards should be processed
assert 'com.pg' == psl.get_sld('telinet.com.pg', wildcard=False)
expected = 'ap-southeast-1.elb.amazonaws.com'
assert 'pg' == psl.get_sld('telinet.com.pg', wildcard=False)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i don't agree with this one. when no wildcards are used (or totally regardless of the PSL at all), pg is a TLD. and so the sld is going to be one level up, i.e., 'com.pg'. It's never the case that 'pg' is an SLD.

expected = 'amazonaws.com'
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think this test was made when '*.amazonaws.com' was in the PSL -- so like Vadym's tests we should probably use a set list that isn't changing. In that case, same logic would happen for this. the wildcard goes away, and the item is processed as a suffix. I guess the readme isn't clear enough on this?

anyhow, currently this is not wildcarded in the PSL so it looks wrong, but the test should be on a fixed example input.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I checked and it seems that *amazonaws.com has never been on the publicsuffix list before.

$ git remote -v
origin  ssh://git@github.com/publicsuffix/list.git (fetch)
origin  ssh://git@github.com/publicsuffix/list.git (push)
$ git log -S'*.amazonaws.com' -p
$

result = psl.get_sld('blah.ap-southeast-1.elb.amazonaws.com', wildcard=False)
assert expected == result

Expand All @@ -226,20 +226,27 @@ def test_get_sld_top_convenience_function_is_the_same_as_PublicSuffixList_method
def test_get_tld_returns_correct_tld_or_etld(self):
psl = publicsuffix.PublicSuffixList()
assert 'com' == psl.get_tld('com')
assert 'ck' == psl.get_tld('ck')
assert 'kobe.jp' == psl.get_tld('city.kobe.jp')
assert 'kobe.jp' == psl.get_tld('kobe.jp')
assert 'amazonaws.com' == psl.get_tld('amazonaws.com')
assert 'jp' == psl.get_tld('kobe.jp')
assert 'com' == psl.get_tld('amazonaws.com')
Comment on lines +231 to +232
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Neither kobe.jp nor amazonaws.com is in https://publicsuffix.org/list/public_suffix_list.dat .

Copy link
Copy Markdown
Contributor

@KnitCode KnitCode Jul 22, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reordered for clarity...

*.kobe.jp was in the psl at some point, as are several subdomains of amazon.aws.
so these are good examples of the behavior you want to fix.

the public suffix list does change regularly, so we have to take some care that we aren't dependent on creating tests that depend on something we don't control. the behavior should be independent of the actual PSL. Also, as we noted in Vadym's pull request, some of the unit tests on the mozilla site were faulty (didn't follow the stated algorithm).

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

our tests shouldn't be reliant on moving data. we should use a fixed test file.
otherwise, agree... unless the wildcards are turned off, and then these become entries in the list.

Copy link
Copy Markdown
Contributor Author

@hiratara hiratara Jul 24, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

our tests shouldn't be reliant on moving data. we should use a fixed test file.

I perfectly agree with you. However, we should create another issue to solve that problem.

I found that we already use static data for testing.

https://github.com/nexB/python-publicsuffix2/blob/develop/src/publicsuffix2/public_suffix_list.dat

assert 'com.pg' == psl.get_tld('telinet.com.pg', wildcard=True)
assert 'pg' == psl.get_tld('telinet.com.pg', wildcard=False)
assert None == psl.get_tld('telinet.com.pg', wildcard=False)
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't use *.pg when we specify wildcard=False . Moreover, publicsuffix list doesn't contain pg .

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure i agree here. certain companies remove the wildcards in the PSL so they can reduce the variety of output. if *.pg is in the list, which it usually is, then, when they remove the wildcard, pg will be in the list. It doesn't remove the entry itself, just the wildcard.

we could argue that users shouldn't use the list this way (but they do), and i don't think we want to have a situation where a user puts in a legitimate domain and gets None for the tld. In this particular case, e.g., 'pg' is a real TLD, so it should never return None.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks to this, I now understand the semantics of wildcard=False. I fix the behavior on c6575d5 .

assert 'com.pg' == psl.get_tld('com.pg', wildcard=True)
assert 'pg' == psl.get_tld('com.pg', wildcard=False)
assert None == psl.get_tld('com.pg', wildcard=False)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should never return none for a valid TLD.

assert 'co.uk' == psl.get_tld('telinet.co.uk', wildcard=False)
assert 'co.uk' == psl.get_tld('co.uk', wildcard=True)
assert 'co.uk' == psl.get_tld('co.uk', wildcard=False)
assert None == psl.get_tld('blah.local', strict=True)
assert None == psl.get_tld('blah.local', wildcard=False)
assert 'local' == psl.get_tld('blah.local')

# See #18
assert 'git-pages.rit.edu' == psl.get_tld('git-pages.rit.edu')
assert 'edu' == psl.get_tld('rit.edu')
assert 'edu' == psl.get_tld('edu')
assert 'com' == psl.get_tld('cdn.fbsbx.com')

def test_get_tld_returns_correct_tld_or_etld_for_fqdn(self):
psl = publicsuffix.PublicSuffixList()
assert 'com' == psl.get_tld('www.foo.com.')
Expand Down