Skip to content

Commit

Permalink
Update our Unicode and legacy codepage support to Unicode 16.0.0
Browse files Browse the repository at this point in the history
We missed a couple of releases, so this adds 11,131 new characters
compared to our previous version, 13.0.0.  These are mostly obscure
alphabets and emojis (I'm just guessing) that are not super likely
to be included in any password.

There were also a few minor changes to the definitions of categories such
as punctuation and specials - we have always used the Unicode Character
Database's definitions of them, and apparently they changed their minds
about a few.

Closes #5537
  • Loading branch information
magnumripper committed Nov 14, 2024
1 parent 8a4dc0d commit dc4d519
Show file tree
Hide file tree
Showing 13 changed files with 41,906 additions and 490 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ run/wpapcap2john
run/zip2john

src/.gdbinit
src/Unicode
src/aes/aes.a
src/arch.h
src/all_tests.lst
Expand Down
93 changes: 42 additions & 51 deletions run/dumb16.conf
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
# This software is Copyright (c) 2012-2020 magnum, and it is hereby
# This software is Copyright (c) 2012-2024 magnum, and it is hereby
# released to the general public under the following terms:
# Redistribution and use in source and binary forms, with or without
# modification, are permitted.
#
# Generic implementation of "dumb" exhaustive search of Unicode BMP.
# Default is to try *all* allocated characters in the BMP of Unicode v13
# (there's 55,387 of them). Even if a fast format can exhaust two characters
# Default is to try *all* allocated characters in the BMP of Unicode v16
# (there's 55,537 of them). Even if a fast format can exhaust two characters
# in 15 minutes, three characters would take 1.5 years...
#
# Note that these modes will handle --max-len differently than normal: They
Expand All @@ -22,7 +22,7 @@ int maxlength; // Maximum password length to try
int last; // Last character position, zero-based
int lastid; // Character index in the last position
int id[0x7f]; // Current character indices for other positions
int charset[0x10000], c0; // Characters
int charset[0xd900], c0; // Characters

void init()
{
Expand All @@ -43,7 +43,7 @@ void init()

/*
* This defines the character set. This is auto-generated from UnicodeData.txt
* and we skip control characters.
* of Unicode 16.0.0 and we skip control characters.
*/
i = 0;
// 0000..007F; Basic Latin
Expand Down Expand Up @@ -119,9 +119,6 @@ void init()
charset[i++] = c++;
// 0600..06FF; Arabic
c = 0x600; // from ARABIC NUMBER SIGN
while (c <= 0x61c) // ..to ARABIC LETTER MARK
charset[i++] = c++;
c = 0x61e; // from ARABIC TRIPLE DOT PUNCTUATION MARK
while (c <= 0x6ff) // ..to ARABIC LETTER HEH WITH INVERTED V
charset[i++] = c++;
// 0700..074F; Syriac
Expand Down Expand Up @@ -163,14 +160,17 @@ void init()
c = 0x860; // from SYRIAC LETTER MALAYALAM NGA
while (c <= 0x86a) // ..to SYRIAC LETTER MALAYALAM SSA
charset[i++] = c++;
// 08A0..08FF; Arabic Extended-A
c = 0x8a0; // from ARABIC LETTER BEH WITH SMALL V BELOW
while (c <= 0x8b4) // ..to ARABIC LETTER KAF WITH DOT BELOW
// 0870..089F; Arabic Extended-B
c = 0x870; // from ARABIC LETTER ALEF WITH ATTACHED FATHA
while (c <= 0x88e) // ..to ARABIC VERTICAL TAIL
charset[i++] = c++;
c = 0x8b6; // from ARABIC LETTER BEH WITH SMALL MEEM ABOVE
while (c <= 0x8c7) // ..to ARABIC LETTER LAM WITH SMALL ARABIC LETTER TAH ABOVE
charset[i++] = 0x890; // ARABIC POUND MARK ABOVE
charset[i++] = 0x891; // ARABIC PIASTRE MARK ABOVE
c = 0x897; // from ARABIC PEPET
while (c <= 0x89f) // ..to ARABIC HALF MADDA OVER MADDA
charset[i++] = c++;
c = 0x8d3; // from ARABIC SMALL LOW WAW
// 08A0..08FF; Arabic Extended-A
c = 0x8a0; // from ARABIC LETTER BEH WITH SMALL V BELOW
while (c <= 0x8ff) // ..to ARABIC MARK SIDEWAYS NOON GHUNNA
charset[i++] = c++;
// 0900..097F; Devanagari
Expand Down Expand Up @@ -360,7 +360,7 @@ void init()
c = 0xc2a; // from TELUGU LETTER PA
while (c <= 0xc39) // ..to TELUGU LETTER HA
charset[i++] = c++;
c = 0xc3d; // from TELUGU SIGN AVAGRAHA
c = 0xc3c; // from TELUGU SIGN NUKTA
while (c <= 0xc44) // ..to TELUGU VOWEL SIGN VOCALIC RR
charset[i++] = c++;
charset[i++] = 0xc46; // TELUGU VOWEL SIGN E
Expand Down Expand Up @@ -406,14 +406,16 @@ void init()
charset[i++] = c++;
charset[i++] = 0xcd5; // KANNADA LENGTH MARK
charset[i++] = 0xcd6; // KANNADA AI LENGTH MARK
charset[i++] = 0xcdd; // KANNADA LETTER NAKAARA POLLU
charset[i++] = 0xcde; // KANNADA LETTER FA
c = 0xce0; // from KANNADA LETTER VOCALIC RR
while (c <= 0xce3) // ..to KANNADA VOWEL SIGN VOCALIC LL
charset[i++] = c++;
c = 0xce6; // from KANNADA DIGIT ZERO
while (c <= 0xcef) // ..to KANNADA DIGIT NINE
charset[i++] = c++;
charset[i++] = 0xcf1; // KANNADA SIGN JIHVAMULIYA
charset[i++] = 0xcf2; // KANNADA SIGN UPADHMANIYA
charset[i++] = 0xcf3; // KANNADA SIGN COMBINING ANUSVARA ABOVE RIGHT
// 0D00..0D7F; Malayalam
c = 0xd00; // from MALAYALAM SIGN COMBINING ANUSVARA ABOVE
while (c <= 0xd0c) // ..to MALAYALAM LETTER VOCALIC L
Expand Down Expand Up @@ -483,7 +485,7 @@ void init()
while (c <= 0xec4) // ..to LAO VOWEL SIGN AI
charset[i++] = c++;
c = 0xec8; // from LAO TONE MAI EK
while (c <= 0xecd) // ..to LAO NIGGAHITA
while (c <= 0xece) // ..to LAO YAMAKKAN
charset[i++] = c++;
c = 0xed0; // from LAO DIGIT ZERO
while (c <= 0xed9) // ..to LAO DIGIT NINE
Expand Down Expand Up @@ -596,11 +598,9 @@ void init()
charset[i++] = c++;
// 1700..171F; Tagalog
c = 0x1700; // from TAGALOG LETTER A
while (c <= 0x170c) // ..to TAGALOG LETTER YA
charset[i++] = c++;
c = 0x170e; // from TAGALOG LETTER LA
while (c <= 0x1714) // ..to TAGALOG SIGN VIRAMA
while (c <= 0x1715) // ..to TAGALOG SIGN PAMUDPOD
charset[i++] = c++;
charset[i++] = 0x171f; // TAGALOG LETTER ARCHAIC RA
// 1720..173F; Hanunoo
c = 0x1720; // from HANUNOO LETTER A
while (c <= 0x1736) // ..to PHILIPPINE DOUBLE PUNCTUATION
Expand Down Expand Up @@ -629,9 +629,6 @@ void init()
charset[i++] = c++;
// 1800..18AF; Mongolian
c = 0x1800; // from MONGOLIAN BIRGA
while (c <= 0x180e) // ..to MONGOLIAN VOWEL SEPARATOR
charset[i++] = c++;
c = 0x1810; // from MONGOLIAN DIGIT ZERO
while (c <= 0x1819) // ..to MONGOLIAN DIGIT NINE
charset[i++] = c++;
c = 0x1820; // from MONGOLIAN LETTER A
Expand Down Expand Up @@ -704,14 +701,14 @@ void init()
charset[i++] = c++;
// 1AB0..1AFF; Combining Diacritical Marks Extended
c = 0x1ab0; // from COMBINING DOUBLED CIRCUMFLEX ACCENT
while (c <= 0x1ac0) // ..to COMBINING LATIN SMALL LETTER TURNED W BELOW
while (c <= 0x1ace) // ..to COMBINING LATIN SMALL LETTER INSULAR T
charset[i++] = c++;
// 1B00..1B7F; Balinese
c = 0x1b00; // from BALINESE SIGN ULU RICEM
while (c <= 0x1b4b) // ..to BALINESE LETTER ASYURA SASAK
while (c <= 0x1b4c) // ..to BALINESE LETTER ARCHAIC JNYA
charset[i++] = c++;
c = 0x1b50; // from BALINESE DIGIT ZERO
while (c <= 0x1b7c) // ..to BALINESE MUSICAL SYMBOL LEFT-HAND OPEN PING
c = 0x1b4e; // from BALINESE INVERTED CARIK SIKI
while (c <= 0x1b7f) // ..to BALINESE PANTI BAWAK
charset[i++] = c++;
// 1B80..1BBF; Sundanese
c = 0x1b80; // from SUNDANESE SIGN PANYECEK
Expand Down Expand Up @@ -739,7 +736,7 @@ void init()
charset[i++] = c++;
// 1C80..1C8F; Cyrillic Extended-C
c = 0x1c80; // from CYRILLIC SMALL LETTER ROUNDED VE
while (c <= 0x1c88) // ..to CYRILLIC SMALL LETTER UNBLENDED UK
while (c <= 0x1c8a) // ..to CYRILLIC SMALL LETTER TJE
charset[i++] = c++;
// 1C90..1CBF; Georgian Extended
c = 0x1c90; // from GEORGIAN MTAVRULI CAPITAL LETTER AN
Expand All @@ -765,9 +762,6 @@ void init()
charset[i++] = c++;
// 1DC0..1DFF; Combining Diacritical Marks Supplement
c = 0x1dc0; // from COMBINING DOTTED GRAVE ACCENT
while (c <= 0x1df9) // ..to COMBINING WIDE INVERTED BRIDGE BELOW
charset[i++] = c++;
c = 0x1dfb; // from COMBINING DELETION MARK
while (c <= 0x1dff) // ..to COMBINING RIGHT ARROWHEAD AND DOWN ARROWHEAD BELOW
charset[i++] = c++;
// 1E00..1EFF; Latin Extended Additional
Expand Down Expand Up @@ -831,7 +825,7 @@ void init()
charset[i++] = c++;
// 20A0..20CF; Currency Symbols
c = 0x20a0; // from EURO-CURRENCY SIGN
while (c <= 0x20bf) // ..to BITCOIN SIGN
while (c <= 0x20c0) // ..to SOM SIGN
charset[i++] = c++;
// 20D0..20FF; Combining Diacritical Marks for Symbols
c = 0x20d0; // from COMBINING LEFT HARPOON ABOVE
Expand Down Expand Up @@ -859,7 +853,7 @@ void init()
charset[i++] = c++;
// 2400..243F; Control Pictures
c = 0x2400; // from SYMBOL FOR NULL
while (c <= 0x2426) // ..to SYMBOL FOR SUBSTITUTE FORM TWO
while (c <= 0x2429) // ..to SYMBOL FOR DELETE MEDIUM SHADE FORM
charset[i++] = c++;
// 2440..245F; Optical Character Recognition
c = 0x2440; // from OCR HOOK
Expand Down Expand Up @@ -925,10 +919,7 @@ void init()
charset[i++] = c++;
// 2C00..2C5F; Glagolitic
c = 0x2c00; // from GLAGOLITIC CAPITAL LETTER AZU
while (c <= 0x2c2e) // ..to GLAGOLITIC CAPITAL LETTER LATINATE MYSLITE
charset[i++] = c++;
c = 0x2c30; // from GLAGOLITIC SMALL LETTER AZU
while (c <= 0x2c5e) // ..to GLAGOLITIC SMALL LETTER LATINATE MYSLITE
while (c <= 0x2c5f) // ..to GLAGOLITIC SMALL LETTER CAUDATE CHRIVI
charset[i++] = c++;
// 2C60..2C7F; Latin Extended-C
c = 0x2c60; // from LATIN CAPITAL LETTER L WITH DOUBLE BAR
Expand Down Expand Up @@ -989,7 +980,7 @@ void init()
charset[i++] = c++;
// 2E00..2E7F; Supplemental Punctuation
c = 0x2e00; // from RIGHT ANGLE SUBSTITUTION MARKER
while (c <= 0x2e52) // ..to TIRONIAN SIGN CAPITAL ET
while (c <= 0x2e5d) // ..to OBLIQUE HYPHEN
charset[i++] = c++;
// 2E80..2EFF; CJK Radicals Supplement
c = 0x2e80; // from CJK RADICAL REPEAT
Expand All @@ -1004,7 +995,7 @@ void init()
charset[i++] = c++;
// 2FF0..2FFF; Ideographic Description Characters
c = 0x2ff0; // from IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT
while (c <= 0x2ffb) // ..to IDEOGRAPHIC DESCRIPTION CHARACTER OVERLAID
while (c <= 0x2fff) // ..to IDEOGRAPHIC DESCRIPTION CHARACTER ROTATION
charset[i++] = c++;
// 3000..303F; CJK Symbols and Punctuation
c = 0x3000; // from IDEOGRAPHIC SPACE
Expand Down Expand Up @@ -1039,8 +1030,9 @@ void init()
charset[i++] = c++;
// 31C0..31EF; CJK Strokes
c = 0x31c0; // from CJK STROKE T
while (c <= 0x31e3) // ..to CJK STROKE Q
while (c <= 0x31e5) // ..to CJK STROKE SZP
charset[i++] = c++;
charset[i++] = 0x31ef; // IDEOGRAPHIC DESCRIPTION CHARACTER SUBTRACTION
// 31F0..31FF; Katakana Phonetic Extensions
c = 0x31f0; // from KATAKANA LETTER SMALL KU
while (c <= 0x31ff) // ..to KATAKANA LETTER SMALL RO
Expand All @@ -1066,7 +1058,7 @@ void init()
charset[i++] = c++;
// 4E00..9FFF; CJK Unified Ideographs
c = 0x4e00; // from <CJK Ideograph, First>
while (c <= 0x9ffc) // ..to <CJK Ideograph, Last>
while (c <= 0x9fff) // ..to <CJK Ideograph, Last>
charset[i++] = c++;
// A000..A48F; Yi Syllables
c = 0xa000; // from YI SYLLABLE IT
Expand Down Expand Up @@ -1098,12 +1090,14 @@ void init()
charset[i++] = c++;
// A720..A7FF; Latin Extended-D
c = 0xa720; // from MODIFIER LETTER STRESS AND HIGH TONE
while (c <= 0xa7bf) // ..to LATIN SMALL LETTER GLOTTAL U
while (c <= 0xa7cd) // ..to LATIN SMALL LETTER S WITH DIAGONAL STROKE
charset[i++] = c++;
c = 0xa7c2; // from LATIN CAPITAL LETTER ANGLICANA W
while (c <= 0xa7ca) // ..to LATIN SMALL LETTER S WITH SHORT STROKE OVERLAY
charset[i++] = 0xa7d0; // LATIN CAPITAL LETTER CLOSED INSULAR G
charset[i++] = 0xa7d1; // LATIN SMALL LETTER CLOSED INSULAR G
c = 0xa7d5; // from LATIN SMALL LETTER DOUBLE WYNN
while (c <= 0xa7dc) // ..to LATIN CAPITAL LETTER LAMBDA WITH STROKE
charset[i++] = c++;
c = 0xa7f5; // from LATIN CAPITAL LETTER REVERSED HALF H
c = 0xa7f2; // from MODIFIER LETTER CAPITAL C
while (c <= 0xa7ff) // ..to LATIN EPIGRAPHIC LETTER ARCHAIC M
charset[i++] = c++;
// A800..A82F; Syloti Nagri
Expand Down Expand Up @@ -1258,19 +1252,16 @@ void init()
charset[i++] = c++;
// FB50..FDFF; Arabic Presentation Forms-A
c = 0xfb50; // from ARABIC LETTER ALEF WASLA ISOLATED FORM
while (c <= 0xfbc1) // ..to ARABIC SYMBOL SMALL TAH BELOW
while (c <= 0xfbc2) // ..to ARABIC SYMBOL WASLA ABOVE
charset[i++] = c++;
c = 0xfbd3; // from ARABIC LETTER NG ISOLATED FORM
while (c <= 0xfd3f) // ..to ORNATE RIGHT PARENTHESIS
charset[i++] = c++;
c = 0xfd50; // from ARABIC LIGATURE TEH WITH JEEM WITH MEEM INITIAL FORM
while (c <= 0xfd8f) // ..to ARABIC LIGATURE MEEM WITH KHAH WITH MEEM INITIAL FORM
charset[i++] = c++;
c = 0xfd92; // from ARABIC LIGATURE MEEM WITH JEEM WITH KHAH INITIAL FORM
while (c <= 0xfdc7) // ..to ARABIC LIGATURE NOON WITH JEEM WITH YEH FINAL FORM
charset[i++] = c++;
c = 0xfdf0; // from ARABIC LIGATURE SALLA USED AS KORANIC STOP SIGN ISOLATED FORM
while (c <= 0xfdfd) // ..to ARABIC LIGATURE BISMILLAH AR-RAHMAN AR-RAHEEM
while (c <= 0xfdff) // ..to ARABIC LIGATURE AZZA WA JALL
charset[i++] = c++;
// FE00..FE0F; Variation Selectors
c = 0xfe00; // from VARIATION SELECTOR-1
Expand Down
Loading

0 comments on commit dc4d519

Please sign in to comment.