dusk/tools/assetstool/processlanguage.py

import sys
import os
from args import args
from assetcache import assetCache, assetGetCache
from assethelpers import getAssetRelativePath
import polib
import re

# Size budget for one chunk of packed string data in a generated language
# file; processLanguageList pads every chunk it writes out to this size.
LANGUAGE_CHUNK_CHAR_COUNT = 6 * 1024 # 6 KB per chunk

# Language name -> {msgid: msgstr}, filled in by processLanguage.
LANGUAGE_DATA = {}
# Every msgid seen so far, in first-seen order. A key's position in this list
# is the index emitted for it in the generated keys header.
LANGUAGE_KEYS = []

def _packLanguageFile(lang, langData, languageKeyChunk, firstFreeChunk):
  """Pack one language's strings into the binary contents of a .dlf file.

  Layout produced:
    - the 3 byte magic b'DLF'
    - for every key in LANGUAGE_KEYS order, three little-endian uint32 values:
      chunk index, byte offset inside that chunk, byte length of the string
    - every used chunk in ascending chunk index order, strings packed tight
      with no terminator, NUL padded up to LANGUAGE_CHUNK_CHAR_COUNT bytes

  Args:
    lang: Language name, only used in error messages.
    langData: Mapping of language key to translated string for this language.
    languageKeyChunk: Mapping of language key to its desired chunk index, or
      -1 when the key has no desired chunk.
    firstFreeChunk: First chunk index that is not reserved for a desired
      chunk group.

  Returns:
    The complete file contents as bytes.

  Exits the process with status 1 if a single translation is larger than one
  chunk, since it could never be placed.
  """
  # Encode once up front: everything written to the file (offsets, lengths,
  # padding) is measured in UTF-8 bytes, not in characters.
  encoded = {key: langData[key].encode('utf-8') for key in LANGUAGE_KEYS}

  # Chunk numbering for the shared pool starts afresh for every language so
  # that one language's chunk usage cannot shift another language's indexes.
  nextChunkIndex = firstFreeChunk
  chunkLens = {}   # chunk index -> bytes used so far
  chunkKeys = {}   # chunk index -> keys stored in it, in write order
  placements = {}  # key -> (chunk index, byte offset inside the chunk)

  for key in LANGUAGE_KEYS:
    size = len(encoded[key])
    if size > LANGUAGE_CHUNK_CHAR_COUNT:
      # Would never fit in any chunk; bail out instead of searching forever.
      print(
        f"Error: Translation for key '{key}' in language '{lang}' is {size} "
        f"bytes, larger than the {LANGUAGE_CHUNK_CHAR_COUNT} byte chunk size"
      )
      sys.exit(1)

    chunkIndex = languageKeyChunk[key]
    if (
      chunkIndex == -1 or
      chunkLens.get(chunkIndex, 0) + size > LANGUAGE_CHUNK_CHAR_COUNT
    ):
      # No desired chunk, or the desired chunk is full: use the shared pool.
      chunkIndex = nextChunkIndex
      if chunkLens.get(chunkIndex, 0) + size > LANGUAGE_CHUNK_CHAR_COUNT:
        # Current pool chunk is full too. Chunks past nextChunkIndex are
        # always empty, so the next one is guaranteed to have room.
        nextChunkIndex += 1
        chunkIndex = nextChunkIndex

    offset = chunkLens.get(chunkIndex, 0)
    chunkLens[chunkIndex] = offset + size
    chunkKeys.setdefault(chunkIndex, []).append(key)
    placements[key] = (chunkIndex, offset)

  # Header magic: Dusk Language File
  parts = [b'DLF']

  # Key table, three uint32_t per key.
  for key in LANGUAGE_KEYS:
    chunkIndex, offset = placements[key]
    parts.append(chunkIndex.to_bytes(4, byteorder='little'))
    parts.append(offset.to_bytes(4, byteorder='little'))
    parts.append(len(encoded[key]).to_bytes(4, byteorder='little'))

  # Chunk data, packed tight with no null terminators, then padded.
  # NOTE(review): only chunks that received at least one key are written, so
  # chunk indexes can be sparse (e.g. chunk 0 is absent when no key matches a
  # desired prefix) - confirm the reader does not assume dense indexes.
  for chunkIndex in sorted(chunkKeys):
    parts.extend(encoded[key] for key in chunkKeys[chunkIndex])
    parts.append(b'\0' * (LANGUAGE_CHUNK_CHAR_COUNT - chunkLens[chunkIndex]))

  return b"".join(parts)


def processLanguageList():
  """Write the language keys header and one .dlf file per loaded language.

  Reads the module-level LANGUAGE_KEYS and LANGUAGE_DATA tables. Every key
  gets a `#define <name> <index>` line in
  <args.headers_dir>/locale/language/keys.h, and every language gets a binary
  file at <args.output_assets>/language/<lang>.dlf.

  Returns:
    A dict with a single 'files' entry listing the .dlf paths written.

  Exits the process with status 1 if any language lacks a translation for a
  key, or if a translation is too large to fit in one chunk.
  """
  # This is the desired chunk groups list.. if a language key STARTS with any
  # of the prefixes in this list we would "like to" put it in that chunk
  # group. If there is no match, or the desired chunk is full, the key goes
  # into the next available chunk that isn't a 'desired' one.
  desiredChunkGroups = {
    'ui': 0
  }

  # Language keys header data
  headerLines = [
    "// Auto-generated language keys header file.\n",
    "#pragma once\n",
    "#include \"dusk.h\"\n\n",
  ]

  # Now, for each language key, create the header reference and index.
  languageKeyChunk = {}
  for keyIndex, key in enumerate(LANGUAGE_KEYS):
    headerLines.append(f"#define {getLanguageVariableName(key)} {keyIndex}\n")

    # Find desired chunk group, -1 when no prefix matches.
    loweredKey = key.lower()
    languageKeyChunk[key] = next(
      (
        chunk for prefix, chunk in desiredChunkGroups.items()
        if loweredKey.startswith(prefix)
      ),
      -1
    )

    # Every language must translate every key.
    for lang in LANGUAGE_DATA:
      if key not in LANGUAGE_DATA[lang]:
        print(f"Warning: Missing translation for key '{key}' in language '{lang}'")
        sys.exit(1)

  # Seal the header.
  headerLines.append(f"\n#define LANG_KEY_COUNT {len(LANGUAGE_KEYS)}\n")

  # First chunk index not reserved by a desired group (0 if there are none).
  firstFreeChunk = max(desiredChunkGroups.values(), default=-1) + 1

  # Now we can generate the language data files.
  files = []
  for lang, langData in LANGUAGE_DATA.items():
    langBuffer = _packLanguageFile(
      lang, langData, languageKeyChunk, firstFreeChunk
    )

    outputFile = os.path.join(args.output_assets, "language", f"{lang}.dlf")
    files.append(outputFile)
    os.makedirs(os.path.dirname(outputFile), exist_ok=True)
    with open(outputFile, "wb") as f:
      f.write(langBuffer)

  # Write out the language keys header file
  outputFile = os.path.join(args.headers_dir, "locale", "language", "keys.h")
  os.makedirs(os.path.dirname(outputFile), exist_ok=True)
  with open(outputFile, "w") as f:
    f.write("".join(headerLines))

  return {
    'files': files
  }

def getLanguageVariableName(languageKey):
  """Return the C macro name used for a language key.

  The key is stripped of surrounding whitespace and uppercased, every
  character that is not an ASCII capital letter or digit becomes an
  underscore, and the result is prefixed with LANG_.
  """
  normalized = languageKey.strip().upper()
  sanitized = "".join(
    ch if ("A" <= ch <= "Z" or "0" <= ch <= "9") else "_"
    for ch in normalized
  )
  return "LANG_" + sanitized

def processLanguage(asset):
  """Load one PO file and merge its entries into the module-level tables.

  Each entry's msgid is recorded in LANGUAGE_KEYS (first-seen order, no
  duplicates) and its msgstr is stored in LANGUAGE_DATA under the language
  named by the PO file's 'Language' metadata field.

  Args:
    asset: Dict with at least a 'path' entry pointing at the PO file.

  Returns:
    The value returned by the asset cache: the previously cached result for
    this path if assetGetCache has one, otherwise whatever assetCache returns
    for the new record.

  Exits the process with status 1 if the PO file does not name its language
  or if a key is already present for that language.
  """
  cache = assetGetCache(asset['path'])
  if cache is not None:
    return cache

  # Load PO File
  po = polib.pofile(asset['path'])

  langName = po.metadata.get('Language')
  if not langName:
    # Without a language name the data would be filed under None / '' and
    # later written out under a meaningless file name.
    print(f"Error: Missing 'Language' metadata in '{asset['path']}'")
    sys.exit(1)

  langData = LANGUAGE_DATA.setdefault(langName, {})

  # Set mirror of LANGUAGE_KEYS so membership checks do not scan the list for
  # every entry; the list itself still carries the ordering.
  knownKeys = set(LANGUAGE_KEYS)

  for entry in po:
    key = entry.msgid

    if key not in knownKeys:
      knownKeys.add(key)
      LANGUAGE_KEYS.append(key)

    if key in langData:
      print(f"Error: Duplicate translation key '{key}' in language '{langName}'")
      sys.exit(1)
    langData[key] = entry.msgstr

  outLanguageData = {
    'data': po,
    'path': asset['path'],
    'files': []
  }
  return assetCache(asset['path'], outLanguageData)