// Named whitespace characters used by the punctuation tables and tokenizers.
const NEW_LINE_CHARACTER = '\n'
const SPACE_CHARACTER = ' '

// Punctuation marks in half-width and full-width form. The element order is
// significant only for the pattern string assembled further down.
// NOTE(review): ']' is listed twice and '\'' three times. The repeats do not
// change what is matched or counted; possibly other characters were intended.
const COUNTED_PUNCTUATION_CHARACTERS = [
  // sentence marks and separators
  '，', '。', ':', '：', ';', '；',
  // square and curly brackets
  '[', ']', '【', ']', '】', '{', '｛', '}', '｝',
  // parentheses and angle brackets
  '(', '（', ')', '）', '<', '《', '>', '》',
  // currency, exclamation, question, tilde
  '$', '￥', '!', '！', '?', '？', '~', '～',
  // quotes
  '\'', '’', '"', '“', '”', '\'', '\'',
  // remaining symbols
  '*', '/', '\\', '&', '%', '％', '@', '#', '^', '、', '＞', '＜',
]

// '.' and ',' are kept out of the counted list but still belong to
// ALL_PUNCTUATION (presumably because they also appear inside numbers).
const DIGIT_DELIMITERS = ['.', ',']

// Newline first, then the counted marks, then the digit delimiters.
const ALL_PUNCTUATION = [NEW_LINE_CHARACTER].concat(
  COUNTED_PUNCTUATION_CHARACTERS,
  DIGIT_DELIMITERS,
)

// Source text of a regex character class covering ALL_PUNCTUATION.
// Joining with a backslash puts an escape in front of every element except
// the first one; the first element is the newline, which needs no escaping
// inside a class. The resulting identity escapes (a backslash before a
// non-syntax character) are only accepted by regular expressions compiled
// WITHOUT the `u`/`v` flag.
const ALL_PUNCTUATION_PATTERN = '[' + ALL_PUNCTUATION.join('\\') + ']'

// Regex alternative matching a run of one or more digits or letters: ASCII
// alphanumerics, Latin-1 Supplement (U+00C0–U+00FF), Latin Extended-A/B,
// IPA Extensions, Latin Extended Additional, Cyrillic (incl. Supplement) and
// Malayalam. The trailing '|' is part of the string so that another
// alternative can be concatenated directly after it.
const COMMON = '[0-9a-zA-Z\u00C0-\u00FF\u0100-\u017F\u0180-\u024F\u0250-\u02AF\u1E00-\u1EFF\u0400-\u04FF\u0500-\u052F\u0D00-\u0D7F]+|'
// Character-class contents (no surrounding brackets) for CJK ideographs —
// Chinese Hànzì, Japanese Kanji and Korean Hanja — together with related
// blocks: radicals, CJK Symbols and Punctuation (U+3000–U+303F), strokes,
// enclosed and compatibility forms.
const CJK = '\u2E80-\u2EFF\u2F00-\u2FDF\u3000-\u303F\u31C0-\u31EF\u3200-\u32FF\u3300-\u33FF\u3400-\u3FFF\u4000-\u4DBF\u4E00-\u4FFF\u5000-\u5FFF\u6000-\u6FFF\u7000-\u7FFF\u8000-\u8FFF\u9000-\u9FFF\uF900-\uFAFF'
// Character-class contents (no surrounding brackets) for Japanese scripts:
// Hiragana, Katakana, Katakana Phonetic Extensions (U+31F0–U+31FF) and
// Kanbun (U+3190–U+319F). Rōmaji is written with Latin letters and is NOT
// covered by these ranges; Latin letters are matched by COMMON instead.
const JP = '\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3190-\u319F'

/**
 * Creates a global regular expression that matches one character from the
 * CJK or JP ranges. A new RegExp instance is returned on every call.
 */
export const japaneseLanguageRegExp = () => {
  const characterClass = `[${CJK}${JP}]`
  return new RegExp(characterClass, 'g')
}

/**
 * Creates a global regular expression that matches any single character
 * described by ALL_PUNCTUATION_PATTERN. A new RegExp instance is returned on
 * every call.
 */
export const allPunctuationRegExp = (): RegExp => {
  return new RegExp(ALL_PUNCTUATION_PATTERN, 'g')
}

/**
 * Replaces every character matched by `allPunctuationRegExp()` in `text`.
 *
 * @param text - The string to process.
 * @param replaceValue - Replacement for each matched character; defaults to
 *   the empty string, i.e. the characters are removed. The value is handed to
 *   `String.prototype.replace`, so `$`-patterns such as `$&` are interpreted.
 * @returns The resulting string.
 */
export const replacePunctuation = (text: string, replaceValue = '') => {
  const punctuation = allPunctuationRegExp()
  return text.replace(punctuation, replaceValue)
}

// Lookup set holding every entry of ALL_PUNCTUATION plus the space character.
const punctuationSymbols = new Set<string>(ALL_PUNCTUATION.concat(SPACE_CHARACTER))

// Compiled once: the pattern is constant, and a non-global RegExp keeps no
// state between `test` calls. The `+` quantifier lets the check accept a
// string of several punctuation characters, not just a single one.
const ONLY_PUNCTUATION_REGEXP = new RegExp(`^${ALL_PUNCTUATION_PATTERN}+$`)

/**
 * Tells whether `text` contains no content other than punctuation.
 *
 * Returns `true` when `text` is empty, consists solely of whitespace, or
 * consists solely of one or more characters from ALL_PUNCTUATION (for example
 * `"!"`, `"?!"` or `"..."`). A string mixing punctuation with whitespace other
 * than the newline (for example `"! ?"`) still yields `false`.
 *
 * @param text - The string to inspect.
 * @returns `true` if `text` is blank or made up only of punctuation characters.
 */
export const isOnlyPunctuation = (text: string): boolean =>
  /^\s*$/.test(text) || ONLY_PUNCTUATION_REGEXP.test(text)

/**
 * Splits `text` at every character contained in `punctuationSymbols`.
 *
 * Each delimiter character becomes a token of its own, and every run of
 * characters between delimiters becomes one token. Empty runs produce no
 * token, so concatenating the returned tokens reproduces `text`.
 *
 * @param text - The string to split.
 * @returns The tokens in their original order.
 */
export const tokenizeJapaneseTextByDelimiters = (text: string) => {
  const tokens: string[] = []
  let pending = ''

  for (const symbol of text) {
    if (!punctuationSymbols.has(symbol)) {
      // Ordinary character: keep growing the current run.
      pending += symbol
      continue
    }

    // Delimiter: emit the run collected so far, then the delimiter itself.
    if (pending) {
      tokens.push(pending)
      pending = ''
    }
    tokens.push(symbol)
  }

  // Emit the trailing run when the text does not end with a delimiter.
  if (pending) {
    tokens.push(pending)
  }
  return tokens
}

/**
 * Splits `text` into tokens after stripping punctuation.
 *
 * Processing steps:
 * 1. Every run of whitespace (including line breaks) is collapsed to a single
 *    space so that it acts as a word separator.
 * 2. Characters matched by `replacePunctuation`, full-width/half-width forms
 *    (U+FF00–U+FFEF) and general punctuation (U+2000–U+206F) are removed.
 * 3. Each space-separated word is scanned: a run matched by COMMON becomes one
 *    token and every single CJK/JP character becomes one token. Characters of
 *    a word that match neither alternative are dropped, unless the word
 *    produces no match at all, in which case the whole word is kept as a token.
 *
 * @param text - The string to tokenize.
 * @returns The tokens in order of appearance; `[]` for empty or blank input.
 */
export const tokenizeJapaneseCharacters = (text: string): string[] => {
  if (!text || text.trim() === '') return []

  // Whitespace must be normalized BEFORE punctuation is stripped. The
  // punctuation pattern contains '\n' and the U+2000–U+206F range contains
  // several space characters; removing them first would glue neighbouring
  // words together ("foo\nbar" -> "foobar") while "foo\r\nbar" was split.
  const normalizedText = replacePunctuation(text.replace(/\s+/g, ' '))
    .replace(/[\uFF00-\uFFEF\u2000-\u206F]/g, '')

  const tokenRegExp = new RegExp(COMMON + '[' + CJK + JP + ']', 'g')
  const result: string[] = []

  for (const word of normalizedText.split(' ')) {
    // Stripped punctuation can leave consecutive spaces, i.e. empty words.
    if (!word) continue

    // With a global RegExp, `match` returns every match (or null for none).
    const matches = word.match(tokenRegExp)
    if (!matches) {
      result.push(word)
      continue
    }
    // Pushed one by one: spreading a very long match list into `push` could
    // exceed the engine's argument limit for long texts without spaces.
    for (const token of matches) {
      result.push(token)
    }
  }
  return result
}

/**
 * Counts how many characters of `text` appear in
 * COUNTED_PUNCTUATION_CHARACTERS. The text is iterated by code point, and
 * every occurrence is counted.
 *
 * @param text - The string to inspect.
 * @returns The number of counted punctuation characters.
 */
export const countPunctuation = (text: string) => {
  let total = 0
  for (const symbol of Array.from(text)) {
    if (COUNTED_PUNCTUATION_CHARACTERS.includes(symbol)) {
      total += 1
    }
  }
  return total
}

/**
 * Removes opening and closing `span` tags from `html`, keeping their content.
 *
 * Tag names are matched case-insensitively (`<SPAN>` is removed as well), and
 * the name must be followed by whitespace, `/` or `>`, so other tags that
 * merely start with "span" (for example `<spanner>`) are left untouched.
 * This is a regex-based strip: a `>` inside an attribute value ends the
 * match early.
 *
 * @param html - Markup to process.
 * @returns `html` without `span` tags.
 */
export const removeSpans = (html: string) =>
  html.replace(/<\/?span(?=[\s/>])[^>]*>/gi, '')

/**
 * Removes every `<...>` tag from `html` except `br` tags, keeping the text
 * between tags.
 *
 * `br` is recognized case-insensitively and only as a complete tag name
 * (followed by whitespace, `/` or `>`), so `<br>`, `<BR>` and `<br />` are
 * preserved while other tags that merely start with "br" are removed.
 * A closing `</br>` is removed like any other closing tag. This is a
 * regex-based strip: any `<` followed later by `>` is treated as a tag.
 *
 * @param html - Markup to process.
 * @returns `html` with all tags except `br` removed.
 */
export const removeAllTagsExceptBr = (html: string) =>
  html.replace(/<(?!br(?=[\s/>]))[^>]+>/gi, '')

/**
 * Removes leading whitespace from `value` when it is followed by at least one
 * non-whitespace character. A string that is empty or consists only of
 * whitespace is returned unchanged.
 *
 * @param value - The string to process.
 * @returns `value` without its leading whitespace.
 */
export const trimStartSpacesForText = (value: string) => {
  // Group 1 captures the first non-whitespace run so it survives the replace.
  const leadingSpacesThenText = /^\s+(\S+)/g
  return value.replace(leadingSpacesThenText, '$1')
}