MediaWiki:Gadget-ImprovedUploadForm-TextCleaner.js

MediaWiki系统消息页面
/* eslint-disable no-control-regex */
/* global TextCleaner:true */
'use strict';

/* <nowiki> */
/**
 * SPDX-License-Identifier: CC-BY-SA-4.0
 * _addText: '{{Gadget Header|license=CC-BY-SA-4.0}}'
 *
 * @source <commons.wikimedia.org/wiki/MediaWiki:TextCleaner.js>
 */
/**
 * Wikitext sanitation for MediaWiki
 * Author: Lupo, January 2008
 * License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0)
 * Choose whichever license of these you like best :-)
 */
(function (mw) {
  /**
   * Wikitext sanitation helper, published as the global `TextCleaner`.
   *
   * Internally, multi-character wikitext tokens are replaced by one-character placeholders
   * while the text is processed, and are mapped back at the very end:
   *
   *   \x01 = template start      \x02 = template end
   *   \x03 = double open bracket \x04 = double close bracket
   *   \x05 = table start         \x06 = table end
   *   \x07 = gallery start tag   \x08 = gallery end tag
   */
  window.TextCleaner = {
    // Lazily filled cache of all names (canonical, localized, aliases) of namespace 6.
    imgNamespaceNames: null,

    /**
     * Attempts to construct well-formed wikitext from input that may contain possibly broken
     * wikitext.
     *
     * Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence
     * of templates, and due to the fact that image thumbnail captions may themselves contain
     * links. This implementation catches the most common errors (such as forgetting to close a
     * template or a link), and even some more elaborate ones. With enough malice, this
     * sanitation can still be broken by user input such that the result is not well-formed
     * wikitext as the parser at the servers would like to have it. (It's still possible that
     * the result is broken wikitext, if the input was broken wikitext. The intent is that
     * well-formed wikitext is never transformed into broken wikitext.)
     *
     * WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It
     * is probably rather inefficient due to the many substrings that are generated. This
     * function is primarily intended to be used to clean up user input in forms, which are
     * typically rather short.
     *
     * @param {string} input Raw wikitext to clean up.
     * @param {boolean} [only_thumbs] If true, all image links are changed to plain links to
     *   the file page (a colon is inserted in front of the namespace), unless the original
     *   image link was a thumbnail or had a width of at most 300px specified.
     * @return {string} The sanitized wikitext; `input` itself if it contains none of the
     *   characters or tags that make sanitation necessary.
     */
    sanitizeWikiText: function sanitizeWikiText(input, only_thumbs) {
      // No critical characters: nothing to do.
      if (input.search(/[[\]{}]|<nowiki(\s[^>]*)?>|<!--/) < 0) {
        return input;
      }
      if (!TextCleaner.imgNamespaceNames) {
        TextCleaner.imgNamespaceNames = [];
        var namespaceIds = mw.config.get('wgNamespaceIds');
        if (namespaceIds) {
          for (var name in namespaceIds) {
            if (namespaceIds[name] === 6) {
              // Image namespace
              TextCleaner.imgNamespaceNames.push(name);
            }
          }
        }
        // Make sure that we have the two canonical names
        TextCleaner.imgNamespaceNames.push('Image', 'File');
        // If your Wiki does not have wgNamespaceIds, add aliases or localized namespace names here!
      }

      // consumed[level - 1] holds the number of input characters that the most recent call of
      // sanitize() at caption_level === level has used up. A caller running at caption level n
      // starts its nested scans at level n + 1 and therefore reads consumed[n] afterwards.
      //
      // For image captions. Image caption may contain links, and may even contain images.
      // The current MediaWiki parser actually allows this only once. For deeper recursions,
      // it fails. But here, it's actually easier to implement no limit.
      var consumed = [0, 0];
      // Everything that needs attention outside of a nowiki section.
      var base_regexp = new RegExp('[\\x01\\x02\\x03\\x04[\\]\\|\\x05\\x06\\x07\\x08]' + '|<nowiki(\\s[^>]*)?>|<!--', 'i'); // Ignore case
      // Inside a nowiki section, only its end tag and HTML comments are of interest.
      var nowiki_regexp = new RegExp('<\\/nowiki(\\s[^>]*)?>|<!--', 'i');
      var allow_only_thumbs = only_thumbs;

      /**
       * Sanitizes a piece of placeholder-encoded text. Calls itself recursively for link
       * targets and link/image captions.
       *
       * @param {string} s Text to process (multi-character tokens already replaced).
       * @param {boolean} with_links Whether links may start here. If false, a bracket outside
       *   of any open template/table/gallery ends the scan.
       * @param {number} caption_level Nesting depth; 0 for the top-level call. For levels > 0
       *   the number of characters used up is reported through `consumed`.
       * @param {boolean} allow_thumbs Whether image links found here may stay inline images.
       * @param {boolean} break_at_pipe Whether a pipe (or table end) outside of any open
       *   template/table/gallery ends the scan.
       * @param {boolean} with_tables Whether a table start may occur here; if not, it ends
       *   the scan.
       * @param {boolean} with_galleries Whether a gallery start may occur here; if not, it
       *   ends the scan.
       * @return {string} The sanitized text up to the point where the scan stopped.
       */
      var sanitize = function sanitize(s, with_links, caption_level, allow_thumbs, break_at_pipe, with_tables, with_galleries) {
        if (!s || !s.length) {
          // Nothing to scan: report that nothing was consumed.
          if (caption_level > 0) {
            if (consumed.length < caption_level) {
              consumed.push(0);
            } else {
              consumed[caption_level - 1] = 0;
            }
          }
          return s;
        }
        var result = '';
        var ch = '';
        var initial_length = s.length;
        // Set when the scan must stop in front of the character at index 'next'.
        var get_out = false;
        var in_nowiki = false;
        // Stack recording template, table, and gallery nesting: holds the expected closing
        // placeholders. It is null (not an empty array) when nothing is open.
        var endings = null;
        var next;
        var regexp = base_regexp;
        var push_end = function push_end(val) {
          if (!endings) {
            endings = [val];
          } else {
            endings.push(val);
          }
        };
        // Removes and returns the innermost expected closing placeholder.
        var pop_end = function pop_end() {
          if (!endings) {
            return null;
          } // Shouldn't happen
          var result;
          if (endings.length === 1) {
            result = endings[0];
            endings = null;
          } else {
            result = endings[endings.length - 1];
            endings.length--;
          }
          return result;
        };
        // Returns the namespace prefix including the colon, exactly as written in 's' at
        // index 'i', if the text there starts with a name of the image namespace (compared
        // case-insensitively); null otherwise.
        var get_initial = function get_initial(i, s) {
          for (var j = 0; j < TextCleaner.imgNamespaceNames.length; j++) {
            if (s.length >= i + TextCleaner.imgNamespaceNames[j].length + 1) {
              var t = s.substr(i, TextCleaner.imgNamespaceNames[j].length + 1);
              if (t.toLowerCase() === "".concat(TextCleaner.imgNamespaceNames[j].toLowerCase(), ":")) {
                return t;
              }
            }
          }
          return null;
        };
        while (s.length > 0 && !get_out) {
          next = s.search(regexp);
          if (next < 0) {
            // No more special characters: the rest is plain text.
            result += s;
            break;
          }
          ch = s.charAt(next);
          var i = -1;
          var j = -1;
          var k = -1;
          switch (ch) {
            case '<':
              {
                // Nowiki or HTML comment. Must be closed.
                if (s.charAt(next + 1) === '!') {
                  // HTML comment. Cannot be nested.
                  i = s.indexOf('-->', next + 3);
                  if (i < 0) {
                    result += "".concat(s, "-->");
                    s = '';
                  } else {
                    result += s.substring(0, i + 3);
                    s = s.substring(i + 3);
                  }
                } else if (s.charAt(next + 1).toLowerCase() === 'n') {
                  // Start of nowiki. The regexps ignore case, so the tag name must be compared
                  // case-insensitively here, too; otherwise an upper-case opening tag would be
                  // mistaken for a closing tag and its contents would get sanitized.
                  // Nowiki may contain HTML comments!
                  in_nowiki = true;
                  regexp = nowiki_regexp;
                  // Copy '<nowiki' (7 characters); the rest of the tag is plain text for us.
                  result += s.substring(0, next + 7);
                  s = s.substring(next + 7);
                } else {
                  // End of nowiki. Searched for and found only if in_nowiki === true
                  in_nowiki = false;
                  regexp = base_regexp;
                  i = s.indexOf('>', next + 1); // End of tag
                  result += s.substring(0, i + 1);
                  s = s.substring(i + 1);
                }
                break;
              }
            case "\x05":
              {
                // Table start
                if (!with_tables) {
                  result += s.substring(0, next);
                  get_out = true;
                  break;
                }
              }
            /* fall through */
            case "\x07":
              {
                if (ch === "\x07" && !with_galleries) {
                  result += s.substring(0, next);
                  get_out = true;
                  break;
                }
              }
            /* fall through */
            case "\x01":
              {
                // Start of template, table, or gallery. The matching end placeholder always
                // has the next higher character code.
                result += s.substring(0, next + 1);
                push_end(String.fromCharCode(ch.charCodeAt(0) + 1).charAt(0));
                s = s.substring(next + 1);
                break;
              }
            case "\x06":
              {
                // Table end
                if (break_at_pipe && !endings) {
                  result += s.substring(0, next);
                  get_out = true;
                  break;
                }
              }
            /* fall through */
            case "\x02":
              {
                // End of a template or table
                result += s.substring(0, next);
                if (!endings || endings[endings.length - 1] !== ch) {
                  // Spurious template or table end: neutralize it with character references.
                  if (ch === "\x02") {
                    result += '&#x7D;&#x7D;';
                  } else {
                    result += '&#x7C;&#x7D;';
                  }
                } else {
                  result += pop_end();
                }
                s = s.substring(next + 1);
                break;
              }
            case "\b":
              {
                // End of gallery (\b is \x08). Always copied; closes a gallery if one is
                // innermost.
                result += s.substring(0, next + 1);
                if (endings && endings[endings.length - 1] === ch) {
                  pop_end();
                }
                s = s.substring(next + 1);
                break;
              }
            case "\x03":
            case '[':
              {
                if (!with_links && !endings) {
                  // Links are not allowed here: stop in front of the bracket. The plain text
                  // before it must still be emitted, because the 'consumed' accounting below
                  // reports it as used up and the caller will therefore skip it.
                  result += s.substring(0, next);
                  get_out = true;
                  break;
                }
                // Image links must be treated specially, since they may contain nested links
                // in the caption!
                var initial = null; // If set, it's 'image:' or 'file:' and we have an image link
                i = next;
                while (i < s.length && s.charAt(i) === ch) {
                  i++;
                }
                if (ch === "\x03" && i < s.length && s.charAt(i) === '[') {
                  i++;
                }
                initial = get_initial(i, s);

                // Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |}
                var lk_text = sanitize(s.substring(i), false, caption_level + 1, false, true, false, false); // No galleries
                var lk_text_length = consumed[caption_level];
                j = i + lk_text_length;
                if (j >= s.length) {
                  // Used up the whole text: [[Foo or [bar
                  if (initial && allow_only_thumbs) {
                    // Should in any case have started with [[, not [
                    result += "".concat(s.substring(0, i - 1), "\x03:").concat(initial).concat(lk_text.substring(initial.length), "\x04");
                  } else {
                    result += s.substring(0, i) + lk_text + (s.charAt(i - 1) === '[' ? ']' : "\x04");
                  }
                  s = '';
                  break;
                }
                if (s.charAt(j) === '|') {
                  k = j;
                } else {
                  k = -1;
                }
                if (k < 0) {
                  // No pipe found: we should be on the closing ]] or ] or [[Foo]] or [bar]
                  if (initial && allow_only_thumbs) {
                    // Should in any case have started with [[, not [
                    result += "".concat(s.substring(0, i - 1), "\x03:").concat(initial).concat(lk_text.substring(initial.length), "\x04");
                  } else {
                    result += s.substring(0, i) + lk_text + (s.charAt(i - 1) === '[' ? ']' : "\x04");
                  }
                  if (s.charAt(j) === ']' || s.charAt(j) === "\x04") {
                    // Indeed closing the link
                    s = s.substring(j + 1);
                  } else {
                    s = s.substring(j);
                  }
                  break;
                } else {
                  var caption = null;
                  var used = 0;
                  // Pipe found.
                  if (!initial) {
                    // Not an image link. Must be something like [[Foo|Bar]].
                    caption = sanitize(s.substring(k + 1),
                    // No links, please
                    false, caption_level + 1,
                    // No thumbs either
                    false,
                    // Don't care about pipes
                    false,
                    // Allow tables (yes, parser allows that!)
                    true,
                    // Allow galleries (?)
                    true);
                    // Now we're at [[, [, ]], or ]
                    used = consumed[caption_level];
                    result += "".concat(s.substring(0, i) + lk_text, "|").concat(caption).concat(s.charAt(i - 1) === '[' ? ']' : "\x04");
                  } else {
                    var q = s.substring(k);
                    // We assume that there are no templates, nowikis, and other nasty things
                    // in the parameters. Search forward until the next [, {, ], }
                    var l = q.search(/[\u0001-\u0008[\]{}]/);
                    if (l < 0) {
                      l = q.length;
                    }
                    if (l + 1 < q.length) {
                      q = q.substring(0, l + 1);
                    }
                    var is_thumb = q.search(/\|\s*thumb(nail)?\s*[\u0004|]/) >= 0;
                    var img_width = /\|\s*(\d+)px\s*[\u0004|]/.exec(q);
                    if (img_width && img_width.length > 1) {
                      img_width = Number.parseInt(img_width[1], 10);
                      if (Number.isNaN(img_width)) {
                        img_width = null;
                      }
                    } else {
                      img_width = null;
                    }
                    if (!img_width) {
                      // No explicit width: thumbs count as small, anything else as large.
                      img_width = is_thumb ? 180 : 301;
                    }
                    var is_small = img_width <= 300;

                    // Caption starts at the last pipe before l. If that is a parameter,
                    // it doesn't hurt.
                    var m = k + q.lastIndexOf('|', l);
                    caption = sanitize(s.substring(m + 1),
                    // Allow links only if it's a thumb
                    is_thumb, caption_level + 1, allow_thumbs && is_thumb,
                    // Don't break at pipe
                    false,
                    // Tables only if it's a thumb
                    is_thumb,
                    // Allow galleries for thumbs (?)
                    is_thumb);
                    used = consumed[caption_level];
                    // caption used 'used' chars from m+1, s.charAt(m+1+used) === '\x04'
                    is_thumb = allow_thumbs && is_small;
                    if (is_thumb || !allow_only_thumbs) {
                      // Keep it as an inline image.
                      result += "".concat(s.substring(0, i - 1), "\x03").concat(lk_text);
                    } else {
                      // Turn it into a plain link to the file page.
                      result += "".concat(s.substring(0, i - 1), "\x03:").concat(initial).concat(lk_text.substring(initial.length));
                    }
                    result += "".concat(s.substring(k, m + 1) + caption, "\x04");
                    k = m;
                  }
                  // Continue behind the caption, skipping the closing ]] if there is one.
                  next = k + 1 + used;
                  if (next < s.length) {
                    if (s.charAt(next) !== "\x04") {
                      s = s.substring(next);
                    } else {
                      s = s.substring(next + 1);
                    }
                  } else {
                    s = '';
                  }
                }
                break;
              }
            case "\x04":
            case ']':
              {
                // Extra bracket.
                result += s.substring(0, next);
                if (!caption_level && !break_at_pipe) {
                  // At top level: neutralize it with character references.
                  result += ch === ']' ? '&#x5D;' : '&#x5D;&#x5D;';
                  s = s.substring(next + 1);
                } else {
                  // In a nested scan: let the caller deal with it.
                  get_out = true;
                }
                break;
              }
            case '|':
              {
                result += s.substring(0, next);
                if (break_at_pipe && !endings) {
                  // Pipe character at top level
                  get_out = true;
                } else {
                  if (!caption_level && !break_at_pipe && !endings) {
                    result += '&#x7C;'; // Top-level pipe character
                  } else {
                    result += '|';
                  }
                  s = s.substring(next + 1);
                }
                break;
              }
          } // end switch
        } // end while
        if (in_nowiki) {
          result += '</nowiki>';
        } // Make sure this nowiki is closed.

        // Close open templates, tables, and galleries; a table end goes on its own line.
        while (endings) {
          ch = pop_end();
          result += (ch === "\x06" ? '\n' : '') + ch;
        }
        if (caption_level > 0) {
          // If we stopped early, 's' is still the unprocessed remainder of the last iteration
          // and the scan stopped in front of its index 'next'.
          var used_up = initial_length - (get_out ? s.length - next : 0);
          if (consumed.length < caption_level) {
            consumed.push(used_up);
          } else {
            consumed[caption_level - 1] = used_up;
          }
        }
        return result;
      };

      // Replace multi-character tokens by one-character placeholders, simplifying the
      // subsequent processing.
      var s = input.replace(/{{/g, "\x01").replace(/\n\s*\|}}}/g, "\n\x06\x02") // Table end + template end
      .replace(/}}/g, "\x02").replace(/\[\[/g, "\x03").replace(/]]/g, "\x04").replace(/\n\s*{\|/g, "\n\x05") // Table start and end must be on own line
      .replace(/^\s*{\|/, "\x05") // Table start at the very beginning
      .replace(/\n\s*\|}/g, "\n\x06") // (we strip leading whitespace)
      .replace(/<\s*gallery\s*>/g, "\x07").replace(/<\/\s*gallery\s*>/g, "\b");
      s = sanitize(s, true, 0, true, false, true, true);
      // with links, allow thumbs, don't break at pipe, allow tables, allow galleries

      // Map the placeholders back to the real tokens.
      return s.replace(/\u0001/g, '{{').replace(/\u0002/g, '}}').replace(/\u0003/g, '[[').replace(/\u0004/g, ']]').replace(/\u0005/g, '{|').replace(/\u0006/g, '|}').replace(/\u0007/g, '<gallery>').replace(/\u0008/g, '</gallery>');
    }
  };
})(mediaWiki);

/* </nowiki> */