# Precompiled character class for Chinese, Japanese and Korean text. Built
# once at import time because the set of ranges never changes, while the
# check itself runs for every search term.
_CJK_PATTERN = re.compile(
    '['
    '\u4e00-\u9fff'  # CJK Unified Ideographs (Chinese hanzi, Japanese kanji)
    '\u3040-\u309f'  # Japanese hiragana
    '\u30a0-\u30ff'  # Japanese katakana
    '\uac00-\ud7af'  # Korean hangul syllables
    '\u1100-\u11ff'  # Korean hangul jamo
    ']'
)


def contains_cjko(s: str) -> bool:
    """Report whether a string contains Chinese, Japanese, or Korean text.

    The check succeeds as soon as any single character of the input falls
    inside one of the Unicode ranges listed in ``_CJK_PATTERN``. Japanese
    kanji need no range of their own: they live in the same CJK Unified
    Ideographs block that is used for Chinese characters.

    Args:
        s (str): string to be checked

    Returns:
        bool: True if at least one character of ``s`` is in a covered
            range, False otherwise (including for the empty string)
    """
    return _CJK_PATTERN.search(s) is not None
@@ -66,12 +87,18 @@ def bold_search_terms(response: str, query: str) -> BeautifulSoup: # Ensure target word is escaped for regex target_word = re.escape(target_word) + # Check if the word contains Chinese, Japanese, or Korean characters + if contains_cjko(target_word): + reg_pattern = fr'((?![{{}}<>-]){target_word}(?![{{}}<>-]))' + else: + reg_pattern = fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b' + if re.match('.*[@_!#$%^&*()<>?/\|}{~:].*', target_word) or ( element.parent and element.parent.name == 'style'): return element.replace_with(BeautifulSoup( - re.sub(fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b', + re.sub(reg_pattern, r'\1', element, flags=re.I), 'html.parser')