# zsh/Functions/Zle/insert-composed-char

# Accented characters.  Inputs two keys.  There are two types: those
# with a base character followed by an accent (see below for codes for
# accents), and those with a two-character mnemonic for the composed
# character.  These are (with the exception of the Euro) the codes
# given by RFC 1345.  Note that some codes in RFC 1345 require three
# characters to be input; none of these are handled.
#
# For best results zsh should have been built with support for
# multibyte characters (--enable-multibyte), but single character sets
# also work.
#
# Outputs the character converted from Unicode into the local representation.
# (The conversion is done within the shell, using whatever facilities
# the C library provides.)
#
# When used as a zle widget, the character is inserted at the cursor
# position.  With a numeric argument, preview in status line; outside zle,
# print character (and newline) to standard output.
#
# The set of accented characters is reasonably complete up to U+0180, the
# set of special characters less so.  However, it mostly gives up at that
# point.  Adding new Unicode characters is easy, however.  Please send any
# additions to zsh-workers@sunsite.dk .
#
# Some of the accent codes are a little more obscure than others.
#  !   Grave
#  '   Acute
#  >   Circumflex
#  ?   Tilde
#  -   Macron.  (A horizontal bar over the letter.)
#  (   Breve.  (A shallow dish shape over the letter.)
#  .   Dot above, or no dot with lower case i, or dot in the middle of L or l.
#  :   Diaeresis (Umlaut)
#  ,   Cedilla
#  _   Underline (none of these currently)
#  /   Stroke through character
#  "   Double acute
#  ;   Ogonek.  (A little forward facing hook at the bottom right
#      of the character.)
#  <   Caron.  (A little v over the letter.)
#  0   Circle
#  2   Hook
#  9   Horn
# Hence A! is upper case A with a grave, c, is lower case c with cedilla.
#
# Some other composed characters:
# Various ligatures:
#  AE ae OE oe IJ ij
#
# ASCII characters not on all keyboards:
#  <(           [
#  //           \
#  )>           ]
#  (!           {
#  !!           |
#  !)           }
#  '?           ~
#
# Special letters:
#  ss		Eszett (scharfes S)
#  D- d- TH th  Eth and thorn
#  kk           kra
#  'n           'n
#  NG ng        ng
#  OI oi        OI
#  yr           yr
#  ED           ezh
#
# Currency symbols:
#  Ct           Cent
#  Pd           Pound sterling
#  Cu           Currency
#  Ye           Yen
#  Eu           Euro (not in RFC 1345 but logical)
#
# Punctuation
#  !I           Inverted !
#  BB           Broken vertical bar
#  SE           Section
#  Co           Copyright
#  -a           Spanish feminine ordinal indicator
#  <<           Left guillemet
#  --           Soft hyphen
#  Rg           Registered trade mark
#  PI           Pilcrow (paragraph)
#  -o           Spanish masculine ordinal indicator
#  >>           Right guillemet
#  ?I           Inverted question mark
#  '6           Left single quote
#  '9           Right single quote
#  .9           "Right" low quote
#  9'           Reversed "right" quote
#  "6           Left double quote
#  "9           Right double quote
#  :9           "Right" low double quote
#  9"           Reversed "right" double quote
#  /-           Dagger
#  /=           Double dagger
#
# Mathematical
#  DG           Degree
#  +-           +/-
#  2S           Superscript 2
#  3S           Superscript 3
#  My           Micro
#  .M           Middle dot
#  1S           Superscript 1
#  14           Quarter
#  12           Half
#  34           Three quarters
#  *X           Multiplication
#  -:           Division
#
# Accents with no base character
# '>            Circumflex (caret)
# '!            Grave (backtick)
# ',            Cedilla
# ':            Diaeresis (Umlaut)
# 'm            Macron
# ''            Acute

# Start from the default zsh option set (-R) and keep the option changes
# local to this function (-L), then switch on the options used below.
emulate -LR zsh
setopt cbases extendedglob printeightbit

# accent   - second key typed (the index into zsh_accented_chars)
# basechar - first key typed
# ochar    - the composed character, converted for output
# error    - command (possibly with arguments) used to report messages
local accent basechar ochar error

# Outside zle, messages are simply printed; when running as a widget
# ($WIDGET is non-empty) they are shown with "zle -M" instead.
if [[ -z $WIDGET ]]; then
  error=print
else
  error=(zle -M)
fi

# Build the lookup table once; it is global, so later invocations find it
# already set and skip this whole section.
if [[ -z ${zsh_accented_chars+set} ]]; then
  # The associative array zsh_accented_chars is indexed by the
  # accent (i.e. the SECOND key typed).  The values are sets of
  # character / Unicode pairs for the character with the given accent,
  # separated by white space.  The Unicode value is a hex index with no
  # base discriminator; essentially a UCS-4 index with the leading
  # zeroes suppressed.
  #
  # Entries assigned with "=" must come before any "+=" for the same key,
  # otherwise the earlier additions are silently discarded.
  typeset -gA zsh_accented_chars

  # grave
  accent=\!
  zsh_accented_chars[$accent]="\
A C0 E C8 I CC O D2 U D9 a E0 e E8 i EC o F2 u F9 N 1F8 n 1F9 \
"
  # acute
  accent=\'
  zsh_accented_chars[$accent]="\
A C1 E C9 I CD O D3 U DA Y DD a E1 e E9 i ED o F3 u FA y FD C 106 c 107 \
L 139 l 13A N 143 n 144 R 154 r 155 S 15A s 15B Z 179 z 17A \
"
  # circumflex
  accent=\>
  zsh_accented_chars[$accent]="\
A C2 E CA I CE O D4 U DB a E2 e EA i EE o F4 u FB C 108 c 109 G 11C g 11d \
H 124 h 125 J 134 j 135 S 15C s 15D W 174 w 175 Y 176 y 177 \
"
  # tilde
  accent=\?
  zsh_accented_chars[$accent]="\
A C3 E 1EBC N D1 O D5 a E3 e 1EBD n F1 o F5 I 128 i 129 U 168 u 169 \
"
  # macron (d-, D- give eth)
  accent=-
  zsh_accented_chars[$accent]="\
A 100 a 101 d F0 D D0 E 112 e 113 I 12a i 12b O 14C o 14D U 16A u 16B \
"
  # breve
  accent=\(
  zsh_accented_chars[$accent]="\
A 102 a 103 E 114 e 115 G 11E g 11F I 12C i 12D O 14E o 14F U 16C u 16D \
"
  # dot above, small i with no dot, or l with middle dot
  accent=.
  zsh_accented_chars[$accent]="\
C 10A c 10b E 116 e 117 G 120 g 121 I 130 i 131 L 13F l 140 Z 17B z 17C \
"
  # diaeresis / Umlaut
  accent=:
  zsh_accented_chars[$accent]="\
A C4 E CB I CF O D6 U DC a E4 e EB i EF o F6 u FC y FF Y 178 \
"
  # cedilla
  accent=,
  zsh_accented_chars[$accent]="\
C C7 c E7 G 122 g 123 K 136 k 137 L 13B l 13C N 145 n 146 R 156 r 157 \
S 15E s 15F T 162 t 163 \
"
  # underline (_) would go here
  # stroke through
  accent=/
  zsh_accented_chars[$accent]="\
O D8 o F8 D 110 d 111 H 126 h 127 L 141 l 142 T 166 t 167 b 180 \
"
  # double acute
  accent=\"
  zsh_accented_chars[$accent]="\
O 150 o 151 U 170 u 171\
"
  # ogonek
  accent=\;
  zsh_accented_chars[$accent]="\
A 104 a 105 E 118 e 119 I 12E i 12F U 172 u 173 \
"
  # caron
  accent=\<
  zsh_accented_chars[$accent]="\
C 10C c 10D D 10E d 10F E 11A e 11B L 13D l 13E N 147 n 148 R 158 r 159 \
S 160 s 161 T 164 t 165 Z 17D z 17E \
"
  # ring above
  accent=0
  zsh_accented_chars[$accent]="\
A C5 a E5 U 16E u 16F \
"
  # hook above
  accent=2
  zsh_accented_chars[$accent]="\
A 1EA2 a 1EA3 E 1EBA e 1EBB \
"
  # horn, also right quotation marks
  accent=9
  zsh_accented_chars[$accent]="\
O 1A0 o 1A1 U 1Af u 1b0 ' 2019 . 201A \" 201D : 201E \
"
  # left quotation marks
  accent=6
  zsh_accented_chars[$accent]="\
' 2018 \" 201C \
"
  # reversed quotation marks for convenience
  accent=\'
  zsh_accented_chars[$accent]+=" \
9 201B \
"
  accent=\"
  zsh_accented_chars[$accent]+=" \
9 201F \
"

  # ligature with E (AE, OE); the key is upper case E so that it is not
  # overwritten by the lower case ligatures below.
  accent=E
  zsh_accented_chars[$accent]="\
A C6 O 152 \
"
  # ligature with e (ae, oe)
  accent=e
  zsh_accented_chars[$accent]="\
a E6 o 153 \
"
  # ligature with J
  accent=J
  zsh_accented_chars[$accent]="\
I 132 \
"
  # ligature with j
  accent=j
  zsh_accented_chars[$accent]="\
i 133 \
"
  # eszett
  accent=s
  zsh_accented_chars[$accent]="\
s DF \
"
  # upper case thorn
  accent=H
  zsh_accented_chars[$accent]="\
T DE \
"
  # lower case thorn
  accent=h
  zsh_accented_chars[$accent]="\
t FE \
"

  # Remaining characters are handled as separate pairs.
  # We need to remember that the assoc array is keyed by the second character.
  # From here on everything is appended with +=, since the key may already
  # hold accented characters from the tables above.
  # Left square bracket
  accent=\(
  zsh_accented_chars[$accent]+=" < 5B"
  # Reverse solidus (backslash to you and me).
  accent=/
  zsh_accented_chars[$accent]+=" / 5C"
  # Right square bracket, circumflex
  accent=\>
  zsh_accented_chars[$accent]+=" ) 5D ' 5E"
  # Grave accent
  accent=\!
  zsh_accented_chars[$accent]+=" ' 60"
  # digraphs for (usually) standard characters {, |, }, ~
  accent=\!
  zsh_accented_chars[$accent]+=" ( 7B"
  zsh_accented_chars[$accent]+=" ! 7C"
  accent=\)
  zsh_accented_chars[$accent]+=" ! 7D"
  accent=\?
  zsh_accented_chars[$accent]+=" ' 7E"
  # non-breaking space
  zsh_accented_chars[S]+=" N A0"
  # inverted exclamation mark
  zsh_accented_chars[I]+=" ! A1"
  # cent
  zsh_accented_chars[t]+=" C A2"
  # pound sterling
  zsh_accented_chars[d]+=" P A3"
  # currency
  zsh_accented_chars[u]+=" C A4"
  # yen
  zsh_accented_chars[e]+=" Y A5"
  # broken bar
  zsh_accented_chars[B]+=" B A6"
  # section
  zsh_accented_chars[E]+=" S A7"
  # lonely diaeresis
  zsh_accented_chars[:]+=" ' A8"
  # copyright
  zsh_accented_chars[o]+=" C A9"
  # spanish feminine ordinal
  zsh_accented_chars[a]+=" - AA"
  # left guillemet
  accent=\<
  zsh_accented_chars[$accent]+=" < AB"
  # not sign
  zsh_accented_chars[O]+=" N AC"
  # soft hyphen
  zsh_accented_chars[-]+=" - AD"
  # registered
  zsh_accented_chars[g]+=" R AE"
  # lonely macron
  zsh_accented_chars[m]+=" ' AF"
  # degree
  zsh_accented_chars[G]+=" D B0"
  # +/-
  zsh_accented_chars[-]+=" + B1"
  # superscripts
  zsh_accented_chars[S]+=" 2 B2 3 B3"
  # lonely acute
  accent=\'
  zsh_accented_chars[$accent]+=" ' B4"
  # micro
  zsh_accented_chars[y]+=" M B5"
  # pilcrow (paragraph)
  zsh_accented_chars[I]+=" P B6"
  # Middle dot
  zsh_accented_chars[M]+=" . B7"
  # Lonely cedilla
  zsh_accented_chars[,]+=" ' B8"
  # Superscript one
  zsh_accented_chars[S]+=" 1 B9"
  # spanish masculine ordinal
  zsh_accented_chars[o]+=" - BA"
  # right guillemet
  accent=\>
  zsh_accented_chars[$accent]+=" > BB"
  # fractions
  zsh_accented_chars[4]+=" 1 BC 3 BE"
  zsh_accented_chars[2]+=" 1 BD"
  # inverted question mark
  zsh_accented_chars[I]+=" ? BF"
  # multiplication
  zsh_accented_chars[X]+=" * D7"
  # division
  zsh_accented_chars[:]+=" - F7"
  # kra
  zsh_accented_chars[k]+=" k 138"
  # apostrophe n
  zsh_accented_chars[n]+=" ' 149"
  # Lappish ng
  zsh_accented_chars[G]+=" N 14A"
  zsh_accented_chars[g]+=" n 14B"
  # OI
  zsh_accented_chars[I]+=" O 1A2"
  zsh_accented_chars[i]+=" o 1A3"
  # yr
  zsh_accented_chars[r]+=" y 1A6"
  # ezh
  zsh_accented_chars[D]+=" E 1B7"
  # euro (I invented this but it's logical)
  zsh_accented_chars[u]+=" E 20AC"
  # dagger and double dagger
  zsh_accented_chars[-]+=" / 2020"
  zsh_accented_chars[=]+=" / 2021"
fi

# Read the two keys: the base character first, then the accent (or the
# second character of a two-character mnemonic).  Give up if either
# read fails.
read -k basechar && read -k accent || return 1

# Split the table entry for the second key into an associative array
# mapping first key -> hex code point.
local -A pairs
pairs=(${=zsh_accented_chars[$accent]})

if (( ${#pairs} == 0 )) || [[ -z $pairs[$basechar] ]]; then
  $error "Combination ${basechar}${accent} is not available."
  return 1
fi

# Code point left-padded with zeroes to the eight hex digits \U expects.
local hex
hex=${(l.8..0.)pairs[$basechar]}

if [[ -n $WIDGET ]]; then
  # Running as a zle widget: convert the character, then either show it
  # (when NUMERIC is set) or insert it to the left of the cursor.
  ochar="$(print -n "\U${hex}")"

  if (( ${+NUMERIC} )); then
    $error "Character ${hex}: $ochar"
  else
    LBUFFER+=$ochar
  fi
else
  # Called directly: when writing to a terminal, first emit a newline,
  # then print the character.
  [[ -t 1 ]] && print
  print "\U${hex}"
fi