Always bold `CN`/`JA`/`KO` search terms (#928)

Add a function to check if target_word contains CJK characters. If a search term contains Chinese, Japanese, or Korean characters, the term is bolded in search results regardless of whitespace. CJK characters: Chinese, Japanese (hiragana, katakana, kanji), and Korean (hangul syllables, hangul jamo). Co-authored-by: Ben Busby <contact@benbusby.com>
2023-01-09 20:54:41 +01:00 · 2023-01-09 20:54:41 +01:00 · e5a5aad997
parent ccf9f06f2f
commit e5a5aad997
1 changed files with 28 additions and 1 deletions
--- a/app/utils/results.py
+++ b/app/utils/results.py
@ -44,6 +44,27 @@ SITE_ALTS = {
 }


+def contains_cjko(s: str) -> bool:
+    """Check whether a string contains Chinese, Japanese, or Korean
+    characters. The string is searched with a regex character class that is
+    built from the Unicode escape ranges listed in the function body.
+
+    Args:
+        s (str): string to be checked
+
+    Returns:
+        bool: True if s has at least one character in the ranges, else False
+    """
+    unicode_ranges = ('\u4e00-\u9fff' # Chinese characters
+                      '\u3040-\u309f' # Japanese hiragana
+                      '\u30a0-\u30ff' # Japanese katakana
+                      '\u4e00-\u9faf' # Japanese kanji (within the first range)
+                      '\uac00-\ud7af' # Korean hangul syllables
+                      '\u1100-\u11ff' # Korean hangul jamo
+                      )
+    return bool(re.search(fr'[{unicode_ranges}]', s))
+
+
 def bold_search_terms(response: str, query: str) -> BeautifulSoup:
    """Wraps all search terms in bold tags (<b>). If any terms are wrapped
    in quotes, only that exact phrase will be made bold.
@ -66,12 +87,18 @@ def bold_search_terms(response: str, query: str) -> BeautifulSoup:
        # Ensure target word is escaped for regex
        target_word = re.escape(target_word)

+        # Check if the word contains Chinese, Japanese, or Korean characters
+        if contains_cjko(target_word):
+            reg_pattern = fr'((?![{{}}<>-]){target_word}(?![{{}}<>-]))'
+        else:
+            reg_pattern = fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b'
+
        if re.match('.*[@_!#$%^&*()<>?/\|}{~:].*', target_word) or (
                element.parent and element.parent.name == 'style'):
            return

        element.replace_with(BeautifulSoup(
-            re.sub(fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b',
+            re.sub(reg_pattern,
                   r'<b>\1</b>',
                   element,
                   flags=re.I), 'html.parser')