Always bold `CN`/`JA`/`KO` search terms (#928)
Add a function to check if target_word contains CJK characters. If a search term contains Chinese, Japanese, or Korean characters, the term is bolded in search results regardless of whitespace. CJK characters: Chinese, Japanese (hiragana, katakana, kanji), and Korean (hangul syllables, hangul jamo). Co-authored-by: Ben Busby <contact@benbusby.com> main
parent
ccf9f06f2f
commit
e5a5aad997
|
@ -44,6 +44,27 @@ SITE_ALTS = {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# Body of a regex character class listing the Unicode blocks treated as
# Chinese, Japanese, or Korean text. The escapes are resolved by Python (the
# literals are not raw strings), so the class contains the actual characters.
_CJK_RANGES = (
    '\u4e00-\u9fff'  # CJK unified ideographs (Chinese hanzi, Japanese kanji)
    '\u3400-\u4dbf'  # CJK unified ideographs, extension A
    '\u3040-\u309f'  # Japanese hiragana
    '\u30a0-\u30ff'  # Japanese katakana
    '\uff66-\uff9f'  # Japanese half-width katakana
    '\uac00-\ud7af'  # Korean hangul syllables
    '\u1100-\u11ff'  # Korean hangul jamo
    '\u3130-\u318f'  # Korean hangul compatibility jamo
)

# Compiled once at import time so repeated calls do not rebuild the pattern.
_CJK_PATTERN = re.compile(f'[{_CJK_RANGES}]')


def contains_cjko(s: str) -> bool:
    """Checks whether a string contains Chinese, Japanese, or Korean
    characters. It searches the string with a regex character class built
    from a set of Unicode ranges.

    Args:
        s (str): string to be checked

    Returns:
        bool: True if the input s contains at least one character from the
            listed ranges and False otherwise (including for an empty string)
    """
    return _CJK_PATTERN.search(s) is not None
|
||||||
|
|
||||||
|
|
||||||
def bold_search_terms(response: str, query: str) -> BeautifulSoup:
|
def bold_search_terms(response: str, query: str) -> BeautifulSoup:
|
||||||
"""Wraps all search terms in bold tags (<b>). If any terms are wrapped
|
"""Wraps all search terms in bold tags (<b>). If any terms are wrapped
|
||||||
in quotes, only that exact phrase will be made bold.
|
in quotes, only that exact phrase will be made bold.
|
||||||
|
@ -66,12 +87,18 @@ def bold_search_terms(response: str, query: str) -> BeautifulSoup:
|
||||||
# Ensure target word is escaped for regex
|
# Ensure target word is escaped for regex
|
||||||
target_word = re.escape(target_word)
|
target_word = re.escape(target_word)
|
||||||
|
|
||||||
|
# Check if the word contains Chinese, Japanese, or Korean characters
|
||||||
|
if contains_cjko(target_word):
|
||||||
|
reg_pattern = fr'((?![{{}}<>-]){target_word}(?![{{}}<>-]))'
|
||||||
|
else:
|
||||||
|
reg_pattern = fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b'
|
||||||
|
|
||||||
if re.match('.*[@_!#$%^&*()<>?/\|}{~:].*', target_word) or (
|
if re.match('.*[@_!#$%^&*()<>?/\|}{~:].*', target_word) or (
|
||||||
element.parent and element.parent.name == 'style'):
|
element.parent and element.parent.name == 'style'):
|
||||||
return
|
return
|
||||||
|
|
||||||
element.replace_with(BeautifulSoup(
|
element.replace_with(BeautifulSoup(
|
||||||
re.sub(fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b',
|
re.sub(reg_pattern,
|
||||||
r'<b>\1</b>',
|
r'<b>\1</b>',
|
||||||
element,
|
element,
|
||||||
flags=re.I), 'html.parser')
|
flags=re.I), 'html.parser')
|
||||||
|
|
Loading…
Reference in New Issue