# Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html
#
# Copyright (C) 2006-2009, Google, International Business Machines Corporation and others. All Rights Reserved.
# Regex for recognizing RFC 4646 well-formed tags
# http://www.rfc-editor.org/rfc/rfc4646.txt
# http://tools.ietf.org/html/draft-ietf-ltru-4646bis-21

# The structure requires no forward references, so it reverses the order.
# It uses Java/Perl syntax instead of the old ABNF
# The uppercase comments are fragments copied from RFC 4646

# Note: the tool requires that any real "=" or "#" or ";" in the regex be escaped.

$alpha = [a-z] ; # ALPHA
$digit = [0-9] ; # DIGIT
$alphanum = [a-z 0-9] ; # ALPHA / DIGIT
$x = x ; # private use singleton
$singleton = [a-w y-z] ; # other singleton
$s = [-_] ; # separator -- lenient parsers will use [-_] -- strict will use [-]

# Now do the components. The structure is slightly different to allow for capturing the right components.
# The notation (?:....) is a non-capturing version of (...): so the "?:" can be deleted if someone doesn't care about capturing.

$language = $alpha{2,8} | $alpha{2,3} $s $alpha{3};

 # ABNF (2*3ALPHA) / 4ALPHA / 5*8ALPHA --- note: because of how | works in regex, don't use $alpha{2,3} | $alpha{4,8}
 # We don't have to have the general case of extlang, because there can be only one extlang (except for zh-min-nan).

# Note: extlang invalid in Unicode language tags

$script = $alpha{4} ; # 4ALPHA

$region = $alpha{2} | $digit{3} ; # 2ALPHA / 3DIGIT

$variant = (?: $alphanum{5,8} | $digit $alphanum{3} ) ; # 5*8alphanum / (DIGIT 3alphanum)

$extension = $singleton (?: $s $alphanum{2,8} )+ ; # singleton 1*("-" (2*8alphanum))

$privateUse = $x (?: $s $alphanum{1,8} )+ ; # "x" 1*("-" (1*8alphanum))

# Define certain legacy language tags (marked as “Type: grandfathered” in BCP 47),
# since otherwise the regex is pretty useless.
# Since these are limited, this is safe even with later changes to the registry --
# the only oddity is that it might change the type of the tag, and thus
# the results from the capturing groups.
# http://www.iana.org/assignments/language-subtag-registry
# Note that these have to be compared case insensitively, requiring (?i) below.

$legacy = en $s GB $s oed
      | i $s (?: ami | bnn | default | enochian | hak | klingon | lux | mingo | navajo | pwn | tao | tay | tsu )
      | no $s (?: bok | nyn )
      | sgn $s (?: BE $s (?: fr | nl) | CH $s de )
      | zh $s min $s nan;

# old:     | zh $s (?: cmn (?: $s Hans | $s Hant )? | gan | min (?: $s nan)? | wuu | yue );
# For well-formedness, we don't need the ones that would otherwise pass.
# For validity, they need to be checked.

# $legacyWellFormed = (?:
#         art $s lojban
#     | cel $s gaulish
#     | zh $s (?: guoyu | hakka | xiang )
#     );

# Unicode locales: but we are shifting to a compatible form
# $keyvalue = (?: $alphanum+ \= $alphanum+);
# $keywords = ($keyvalue (?: \; $keyvalue)*);

# We separate items that we want to capture as a single group

$variantList = $variant (?: $s $variant )* ; # special for multiples
$extensionList = $extension (?: $s $extension )* ; # special for multiples

# NOTE(review): the trailing percentages (40%, 10%, 5%, 90%) here and in $root are not regex syntax.
# They look like per-item weights consumed by the tool that reads this file -- confirm against the tool.
# Capturing groups, in order: language, script, region, variant list, extension list, private use.

$langtag = (?: ( $language )
      (?: $s ( $script ) )? 40%
      (?: $s ( $region ) )? 40%
      (?: $s ( $variantList ) )? 10%
      (?: $s ( $extensionList ) )? 5%
      (?: $s ( $privateUse ) )? 5%);

# Here is the final breakdown, with capturing groups for each of these components
# The variants, extensions, legacy, and private-use may have interior '-'

$root = (?i) # case-insensitive
  (?:
      $langtag 90%
    | ( $privateUse ) 5%
    | ( $legacy ) 5%)
#    (?: \@ $keywords )? 5%
    ;