Always bold `CN`/`JA`/`KO` search terms (#928)

Add a function to check if target_word contains CJK characters

If a search term contains Chinese, Japanese, or Korean characters,
the term is bolded in search results regardless of whitespace.

CJK characters: Chinese, Japanese (hiragana, katakana, kanji), 
and Korean (hangul syllables, hangul jamo)

Co-authored-by: Ben Busby <contact@benbusby.com>
main
Ahmad Alkadri 2023-01-09 20:54:41 +01:00 committed by GitHub
parent ccf9f06f2f
commit e5a5aad997
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 28 additions and 1 deletions

View File

@ -44,6 +44,27 @@ SITE_ALTS = {
} }
# Character class covering the Unicode blocks treated as CJK. Compiled once
# at import time so each call is a single search. The literals are non-raw on
# purpose: Python expands the \u escapes into the actual code points, and the
# '-' between them forms a regex range.
_CJK_PATTERN = re.compile(
    '['
    '\u1100-\u11ff'  # Korean hangul jamo
    '\u3040-\u309f'  # Japanese hiragana
    '\u30a0-\u30ff'  # Japanese katakana
    '\u3130-\u318f'  # Korean hangul compatibility jamo (e.g. standalone jamo)
    '\u3400-\u4dbf'  # CJK unified ideographs, extension A
    '\u4e00-\u9fff'  # CJK unified ideographs (Chinese hanzi, Japanese kanji)
    '\uac00-\ud7af'  # Korean hangul syllables
    ']'
)


def contains_cjko(s: str) -> bool:
    """Check whether a string contains Chinese, Japanese, or Korean characters.

    The check is a regex search against a character class built from the
    Unicode ranges listed in ``_CJK_PATTERN``. A single matching character
    anywhere in the string is enough.

    Args:
        s (str): string to be checked

    Returns:
        bool: True if the input s contains at least one character from the
            covered ranges and False otherwise (including for an empty
            string)
    """
    return _CJK_PATTERN.search(s) is not None
def bold_search_terms(response: str, query: str) -> BeautifulSoup: def bold_search_terms(response: str, query: str) -> BeautifulSoup:
"""Wraps all search terms in bold tags (<b>). If any terms are wrapped """Wraps all search terms in bold tags (<b>). If any terms are wrapped
in quotes, only that exact phrase will be made bold. in quotes, only that exact phrase will be made bold.
@ -66,12 +87,18 @@ def bold_search_terms(response: str, query: str) -> BeautifulSoup:
# Ensure target word is escaped for regex # Ensure target word is escaped for regex
target_word = re.escape(target_word) target_word = re.escape(target_word)
# Check if the word contains Chinese, Japanese, or Korean characters
if contains_cjko(target_word):
reg_pattern = fr'((?![{{}}<>-]){target_word}(?![{{}}<>-]))'
else:
reg_pattern = fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b'
if re.match('.*[@_!#$%^&*()<>?/\|}{~:].*', target_word) or ( if re.match('.*[@_!#$%^&*()<>?/\|}{~:].*', target_word) or (
element.parent and element.parent.name == 'style'): element.parent and element.parent.name == 'style'):
return return
element.replace_with(BeautifulSoup( element.replace_with(BeautifulSoup(
re.sub(fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b', re.sub(reg_pattern,
r'<b>\1</b>', r'<b>\1</b>',
element, element,
flags=re.I), 'html.parser') flags=re.I), 'html.parser')