1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
|
--- filters.py 2024-11-03 19:21:06.476962281 +0100
+++ filters-patched.py 2024-11-03 19:27:01.703706898 +0100
@@ -53,7 +53,7 @@
\\S+? # URL body
(?=\\s|[.]\\s|$|[.]$) # Stop at space/end, or a dot followed by space/end
) | ( # or...
- \w+([:.]?\w+)* # word characters, with opt. internal colons/dots
+ \\w+([:.]?\\w+)* # word characters, with opt. internal colons/dots
)
""", verbose=True)
@@ -155,7 +155,7 @@
>>> f1 = LowercaseFilter()
>>> # In the other branch, we'll reverse the tokens
>>> f2 = ReverseTextFilter()
- >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2)
+ >>> ana = RegexTokenizer(r"\\S+") | TeeFilter(f1, f2)
>>> [token.text for token in ana(target)]
["alfa", "AFLA", "bravo", "OVARB", "charlie", "EILRAHC"]
@@ -164,7 +164,7 @@
>>> f1 = PassFilter()
>>> f2 = BiWordFilter()
- >>> ana = RegexTokenizer(r"\S+") | TeeFilter(f1, f2) | LowercaseFilter()
+ >>> ana = RegexTokenizer(r"\\S+") | TeeFilter(f1, f2) | LowercaseFilter()
>>> [token.text for token in ana(target)]
["alfa", "alfa-bravo", "bravo", "bravo-charlie", "charlie"]
"""
|