From e5a5aad997662187ee9310b2a3a41e83911a640f Mon Sep 17 00:00:00 2001
From: Ahmad Alkadri <ahmad.alkadri@outlook.com>
Date: Mon, 9 Jan 2023 20:54:41 +0100
Subject: [PATCH] Always bold `CN`/`JA`/`KO` search terms (#928)

Add a function to check if target_word contains CJK characters

If a search term contains Chinese, Japanese, or Korean characters,
the term is bolded in search results regardless of whitespace.

CJK characters: Chinese, Japanese (hiragana, katakana, kanji),
and Korean (hangul syllables, hangul jamo)

Co-authored-by: Ben Busby <contact@benbusby.com>
---
 app/utils/results.py | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)
diff --git a/app/utils/results.py b/app/utils/results.py
index cd4392d..640a93b 100644
--- a/app/utils/results.py
+++ b/app/utils/results.py
@@ -44,6 +44,27 @@ SITE_ALTS = {
 }
 
 
+def contains_cjko(s: str) -> bool:
+    """Check whether a string contains Chinese, Japanese, or Korean
+    characters. A regex character class is built from Unicode escape
+    ranges, and a single matching character anywhere in s is enough.
+
+    Args:
+        s (str): string to be checked
+
+    Returns:
+        bool: True if s contains at least one such character, else False
+    """
+    unicode_ranges = ('\u4e00-\u9fff' # Chinese characters
+                      '\u3040-\u309f' # Japanese hiragana
+                      '\u30a0-\u30ff' # Japanese katakana
+                      '\u4e00-\u9faf' # Japanese kanji (subset of first range)
+                      '\uac00-\ud7af' # Korean hangul syllables
+                      '\u1100-\u11ff' # Korean hangul jamo
+                      )
+    return bool(re.search(fr'[{unicode_ranges}]', s))
+
+
 def bold_search_terms(response: str, query: str) -> BeautifulSoup:
     """Wraps all search terms in bold tags (<b>). If any terms are wrapped
     in quotes, only that exact phrase will be made bold.
@@ -66,12 +87,18 @@ def bold_search_terms(response: str, query: str) -> BeautifulSoup:
         # Ensure target word is escaped for regex
         target_word = re.escape(target_word)
 
+        # Check if the word contains Chinese, Japanese, or Korean characters
+        if contains_cjko(target_word):
+            reg_pattern = fr'((?![{{}}<>-]){target_word}(?![{{}}<>-]))'
+        else:
+            reg_pattern = fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b'
+
         if re.match('.*[@_!#$%^&*()<>?/\|}{~:].*', target_word) or (
                 element.parent and element.parent.name == 'style'):
             return
 
         element.replace_with(BeautifulSoup(
-            re.sub(fr'\b((?![{{}}<>-]){target_word}(?![{{}}<>-]))\b',
+            re.sub(reg_pattern,
                    r'<b>\1</b>',
                    element,
                    flags=re.I), 'html.parser')