MediaWiki:Gadget-ImprovedUploadForm-TextCleaner.js

请注意：更新本页面后，您可能需要清除浏览器缓存才能看到所作变更的影响。
Google Chrome、Microsoft Edge、Mozilla Firefox、Safari：按住⇧ Shift，同时单击“刷新”。
本小工具的部分或全部代码来源于求闻百科源代码导入，在导入后由有兽档案馆技术团队维护于Git仓库内。
对本页面内容的变更，应通知技术团队，以免在代码部署时被误覆盖。
本小工具代码采用下列版权许可：
本文件采用知识共享署名-相同方式共享 4.0 国际（CC BY-SA 4.0）许可协议授权。
您可以自由地：
分享 – 在任何媒介以任何形式复制、发行本作品
演绎 – 修改、转换或以本作品为基础进行创作
惟须遵守下列条件：
署名 – 您必须给出适当的署名（标明原作者），提供指向本许可协议的链接，同时标明是否（对原始作品）作了修改。您可以用任何合理的方式来署名，但是不得以任何方式暗示许可人为您或您的使用背书（即诱导原作者在不了解协议情况下给你授权）。
相同方式共享 – 如果您再混合、转换或者基于本作品进行创作，您必须基于与原先许可协议相同或相兼容的许可协议分发您贡献的作品。
CC BY-SA 4.0：Creative Commons Attribution-ShareAlike 4.0，https://creativecommons.org/licenses/by-sa/4.0
许可证条款文本
/* eslint-disable no-control-regex */
/* global TextCleaner:true */
'use strict';

/* <nowiki> */
/**
 * SPDX-License-Identifier: CC-BY-SA-4.0
 * _addText: '{{Gadget Header|license=CC-BY-SA-4.0}}'
 *
 * @source <commons.wikimedia.org/wiki/MediaWiki:TextCleaner.js>
 */
/**
 * Wikitext sanitation for MediaWiki
 * Author: Lupo, January 2008
 * License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0)
 * Choose whichever license of these you like best :-)
 */
(function (mw) {
  /**
   * Wikitext sanitation helpers, published as the global `TextCleaner`.
   */
  window.TextCleaner = {
    // Cache of all names (canonical, aliases, localized) of the image namespace (id 6).
    // Filled lazily on the first call to sanitizeWikiText().
    imgNamespaceNames: null,

    /**
     * Attempts to construct well-formed wikitext from input that may contain possibly broken
     * wikitext.
     *
     * Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence
     * of templates, and due to the fact that image thumbnail captions may themselves contain
     * links. This implementation catches the most common errors (such as forgetting to close a
     * template or a link), and even some more elaborate ones. With enough malice, this sanitation
     * can still be broken by user input such that the result is not well-formed wikitext as the
     * parser at the servers would like to have it. (It's still possible that the result is broken
     * wikitext, if the input was broken wikitext. But it is meant never to transform well-formed
     * wikitext into broken wikitext.)
     *
     * Internally the multi-character tokens are first replaced by one-character placeholders:
     *   \x01 = '{{', \x02 = '}}', \x03 = '[[', \x04 = ']]',
     *   \x05 = '{|', \x06 = '|}', \x07 = '<gallery>', \x08 = '</gallery>'
     * and converted back once the text has been processed.
     *
     * WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It is
     * probably rather inefficient due to the many substrings that are generated. This function is
     * primarily intended to be used to clean up user input in forms, which are typically rather
     * short.
     *
     * @param {string} input Wikitext to clean up.
     * @param {boolean} [only_thumbs] If true, all [[Image: links are changed to [[:Image:, unless
     *   the original image link was a thumbnail or had a width of at most 300px specified.
     * @return {string} The sanitized wikitext; `input` itself if it contains no critical tokens.
     */
    sanitizeWikiText: function sanitizeWikiText(input, only_thumbs) {
      // No critical characters: nothing to do. The tag test ignores case, exactly like the
      // regular expressions used for the actual processing below.
      if (input.search(/[[\]{}]|<nowiki(\s[^>]*)?>|<!--/i) < 0) {
        return input;
      }
      if (!TextCleaner.imgNamespaceNames) {
        TextCleaner.imgNamespaceNames = [];
        var namespaceIds = mw.config.get('wgNamespaceIds');
        if (namespaceIds) {
          for (var name in namespaceIds) {
            if (namespaceIds[name] === 6) {
              // Image namespace
              TextCleaner.imgNamespaceNames.push(name);
            }
          }
        }
        // Make sure that we have the two canonical names
        TextCleaner.imgNamespaceNames.push('Image', 'File');
        // If your Wiki does not have wgNamespaceIds, add aliases or localized namespace names here!
      }

      // consumed[level - 1] is where a recursive sanitize() call made with caption_level === level
      // reports how many characters of its input it has processed.
      // For image captions. Image caption may contain links, and may even contain images.
      // The current MediaWiki parser actually allows this only once. For deeper recursions,
      // it fails. But here, it's actually easier to implement no limit.
      var consumed = [0, 0];
      // Tokens of interest in normal text (placeholders, single brackets, pipes, <nowiki>, <!--).
      var base_regexp = new RegExp('[\\x01\\x02\\x03\\x04[\\]\\|\\x05\\x06\\x07\\x08]' + '|<nowiki(\\s[^>]*)?>|<!--', 'i'); // Ignore case
      // Tokens of interest while inside a <nowiki> section.
      var nowiki_regexp = new RegExp('<\\/nowiki(\\s[^>]*)?>|<!--', 'i');
      var allow_only_thumbs = only_thumbs;

      /**
       * Recursive worker operating on text in placeholder form.
       *
       * @param {string} s Text to process.
       * @param {boolean} with_links Whether links may start at the top level of s; if not,
       *   processing stops at the first top-level '[' or '[['.
       * @param {number} caption_level Recursion depth; 0 for the outermost call. For depths > 0
       *   the number of processed characters is stored in consumed[caption_level - 1].
       * @param {boolean} allow_thumbs Whether image links found here may remain inline images.
       * @param {boolean} break_at_pipe Whether to stop at a top-level '|' (or table end).
       * @param {boolean} with_tables Whether tables may start here; if not, stop at '{|'.
       * @param {boolean} with_galleries Whether galleries may start here; if not, stop at one.
       * @return {string} The sanitized text, still in placeholder form.
       */
      var sanitize = function sanitize(s, with_links, caption_level, allow_thumbs, break_at_pipe, with_tables, with_galleries) {
        if (!s || !s.length) {
          // Nothing to process: report zero consumed characters to the caller.
          if (caption_level > 0) {
            if (consumed.length < caption_level) {
              consumed.push(0);
            } else {
              consumed[caption_level - 1] = 0;
            }
          }
          return s;
        }
        var result = '';
        var ch = '';
        var initial_length = s.length;
        var get_out = false; // Set when processing must stop before the end of s
        var in_nowiki = false;
        // Stack recording template, table, and gallery nesting (holds the expected end tokens)
        var endings = null;
        var next;
        var regexp = base_regexp;
        var push_end = function push_end(val) {
          if (!endings) {
            endings = [val];
          } else {
            endings.push(val);
          }
        };
        // Pops the innermost expected end token; resets the stack to null when it becomes empty,
        // because the code below tests `!endings` to detect "at top level".
        var pop_end = function pop_end() {
          if (!endings) {
            return null;
          } // Shouldn't happen
          var top;
          if (endings.length === 1) {
            top = endings[0];
            endings = null;
          } else {
            top = endings[endings.length - 1];
            endings.length--;
          }
          return top;
        };
        // Returns the image namespace prefix (including the colon, in its original spelling)
        // found in `text` at position `pos`, or null if there is none.
        var get_initial = function get_initial(pos, text) {
          for (var n = 0; n < TextCleaner.imgNamespaceNames.length; n++) {
            if (text.length >= pos + TextCleaner.imgNamespaceNames[n].length + 1) {
              var t = text.substr(pos, TextCleaner.imgNamespaceNames[n].length + 1);
              if (t.toLowerCase() === "".concat(TextCleaner.imgNamespaceNames[n].toLowerCase(), ":")) {
                return t;
              }
            }
          }
          return null;
        };
        while (s.length > 0 && !get_out) {
          next = s.search(regexp);
          if (next < 0) {
            result += s;
            break;
          }
          ch = s.charAt(next);
          var i = -1;
          var j = -1;
          var k = -1;
          switch (ch) {
            case '<':
              {
                // Nowiki or HTML comment. Must be closed.
                if (s.charAt(next + 1) === '!') {
                  // HTML comment. Cannot be nested.
                  i = s.indexOf('-->', next + 3);
                  if (i < 0) {
                    result += "".concat(s, "-->");
                    s = '';
                  } else {
                    result += s.substring(0, i + 3);
                    s = s.substring(i + 3);
                  }
                } else if (s.charAt(next + 1).toLowerCase() === 'n') {
                  // Start of nowiki. The regexps match tag names regardless of case, so the
                  // comparison here must ignore case, too; otherwise <NOWIKI> would be mistaken
                  // for a closing tag.
                  // Nowiki may contain HTML comments!
                  in_nowiki = true;
                  regexp = nowiki_regexp;
                  // Copy '<nowiki' (7 characters); the rest of the tag is plain text for us.
                  result += s.substring(0, next + 7);
                  s = s.substring(next + 7);
                } else {
                  // End of nowiki. Searched for and found only if in_nowiki === true
                  in_nowiki = false;
                  regexp = base_regexp;
                  i = s.indexOf('>', next + 1); // End of tag
                  result += s.substring(0, i + 1);
                  s = s.substring(i + 1);
                }
                break;
              }
            case "\x05":
              {
                // Table start
                if (!with_tables) {
                  result += s.substring(0, next);
                  get_out = true;
                  break;
                }
              }
            /* fall through */
            case "\x07":
              {
                // Gallery start (also reached by fall-through for an allowed table start)
                if (ch === "\x07" && !with_galleries) {
                  result += s.substring(0, next);
                  get_out = true;
                  break;
                }
              }
            /* fall through */
            case "\x01":
              {
                // Start of template, table, or gallery. The matching end placeholder always has
                // the next higher character code; remember it on the stack.
                result += s.substring(0, next + 1);
                push_end(String.fromCharCode(ch.charCodeAt(0) + 1).charAt(0));
                s = s.substring(next + 1);
                break;
              }
            case "\x06":
              {
                // Table end
                if (break_at_pipe && !endings) {
                  result += s.substring(0, next);
                  get_out = true;
                  break;
                }
              }
            /* fall through */
            case "\x02":
              {
                // End of a template or table
                result += s.substring(0, next);
                if (!endings || endings[endings.length - 1] !== ch) {
                  // Spurious template or table end: emit it as character references
                  if (ch === "\x02") {
                    result += '&#x7D;&#x7D;';
                  } else {
                    result += '&#x7C;&#x7D;';
                  }
                } else {
                  result += pop_end();
                }
                s = s.substring(next + 1);
                break;
              }
            case "\b":
              {
                // End of gallery
                result += s.substring(0, next + 1);
                if (endings && endings[endings.length - 1] === ch) {
                  pop_end();
                }
                s = s.substring(next + 1);
                break;
              }
            case "\x03":
            case '[':
              {
                if (!with_links && !endings) {
                  // Links are not allowed here: stop in front of the bracket. The text before
                  // it has been processed (it is included in the consumed count computed at
                  // the end), so it must be part of the result as well.
                  result += s.substring(0, next);
                  get_out = true;
                  break;
                }
                // Image links must be treated specially, since they may contain nested links
                // in the caption!
                var initial = null; // If set, it's 'image:' or 'file:' and we have an image link
                // Skip the run of opening brackets
                i = next;
                while (i < s.length && s.charAt(i) === ch) {
                  i++;
                }
                if (ch === "\x03" && i < s.length && s.charAt(i) === '[') {
                  i++;
                }
                initial = get_initial(i, s);

                // Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |}
                var lk_text = sanitize(s.substring(i), false, caption_level + 1, false, true, false, false); // No galleries
                var lk_text_length = consumed[caption_level];
                j = i + lk_text_length; // Index in s of the character the scan stopped at
                if (j >= s.length) {
                  // Used up the whole text: [[Foo or [bar
                  if (initial && allow_only_thumbs) {
                    // Should in any case have started with [[, not [
                    result += "".concat(s.substring(0, i - 1), "\x03:").concat(initial).concat(lk_text.substring(initial.length), "\x04");
                  } else {
                    result += s.substring(0, i) + lk_text + (s.charAt(i - 1) === '[' ? ']' : "\x04");
                  }
                  s = '';
                  break;
                }
                if (s.charAt(j) === '|') {
                  k = j;
                } else {
                  k = -1;
                }
                if (k < 0) {
                  // No pipe found: we should be on the closing ]] or ] or [[Foo]] or [bar]
                  if (initial && allow_only_thumbs) {
                    // Should in any case have started with [[, not [
                    result += "".concat(s.substring(0, i - 1), "\x03:").concat(initial).concat(lk_text.substring(initial.length), "\x04");
                  } else {
                    result += s.substring(0, i) + lk_text + (s.charAt(i - 1) === '[' ? ']' : "\x04");
                  }
                  if (s.charAt(j) === ']' || s.charAt(j) === "\x04") {
                    // Indeed closing the link
                    s = s.substring(j + 1);
                  } else {
                    s = s.substring(j);
                  }
                  break;
                } else {
                  var caption = null;
                  var used = 0;
                  // Pipe found.
                  if (!initial) {
                    // Not an image link. Must be something like [[Foo|Bar]].
                    caption = sanitize(s.substring(k + 1),
                    // No links, please
                    false, caption_level + 1,
                    // No thumbs either
                    false,
                    // Don't care about pipes
                    false,
                    // Allow tables (yes, parser allows that!)
                    true,
                    // Allow galleries (?)
                    true);
                    // Now we're at [[, [, ]], or ]
                    used = consumed[caption_level];
                    result += "".concat(s.substring(0, i) + lk_text, "|").concat(caption).concat(s.charAt(i - 1) === '[' ? ']' : "\x04");
                  } else {
                    var q = s.substring(k);
                    // We assume that there are no templates, nowikis, and other nasty things
                    // in the parameters. Search forward until the next [, {, ], }
                    var l = q.search(/[\u0001-\u0008[\]{}]/);
                    if (l < 0) {
                      l = q.length;
                    }
                    if (l + 1 < q.length) {
                      q = q.substring(0, l + 1);
                    }
                    var is_thumb = q.search(/\|\s*thumb(nail)?\s*[\u0004|]/) >= 0;
                    var img_width = /\|\s*(\d+)px\s*[\u0004|]/.exec(q);
                    if (img_width && img_width.length > 1) {
                      img_width = Number.parseInt(img_width[1], 10);
                      if (Number.isNaN(img_width)) {
                        img_width = null;
                      }
                    } else {
                      img_width = null;
                    }
                    if (!img_width) {
                      // No explicit width: thumbs count as small (180), anything else as
                      // just above the 300px limit.
                      img_width = is_thumb ? 180 : 301;
                    }
                    var is_small = img_width <= 300;

                    // Caption starts at the last pipe before l. If that is a parameter,
                    // it doesn't hurt.
                    var m = k + q.lastIndexOf('|', l);
                    caption = sanitize(s.substring(m + 1),
                    // Allow links only if it's a thumb
                    is_thumb, caption_level + 1, allow_thumbs && is_thumb,
                    // Don't break at pipe
                    false,
                    // Tables only if it's a thumb
                    is_thumb,
                    // Allow galleries for thumbs (?)
                    is_thumb);
                    used = consumed[caption_level];
                    // caption used 'used' chars from m+1, s.charAt(m+1+used) === '\x04'
                    is_thumb = allow_thumbs && is_small;
                    if (is_thumb || !allow_only_thumbs) {
                      // Keep it as an inline image
                      result += "".concat(s.substring(0, i - 1), "\x03").concat(lk_text);
                    } else {
                      // Turn it into a plain link to the image page: [[:Image:...
                      result += "".concat(s.substring(0, i - 1), "\x03:").concat(initial).concat(lk_text.substring(initial.length));
                    }
                    result += "".concat(s.substring(k, m + 1) + caption, "\x04");
                    k = m;
                  }
                  // Continue behind the caption, skipping the closing ]] if there is one.
                  next = k + 1 + used;
                  if (next < s.length) {
                    if (s.charAt(next) !== "\x04") {
                      s = s.substring(next);
                    } else {
                      s = s.substring(next + 1);
                    }
                  } else {
                    s = '';
                  }
                }
                break;
              }
            case "\x04":
            case ']':
              {
                // Extra bracket.
                result += s.substring(0, next);
                if (!caption_level && !break_at_pipe) {
                  // At the outermost level: emit it as character references
                  result += ch === ']' ? '&#x5D;' : '&#x5D;&#x5D;';
                  s = s.substring(next + 1);
                } else {
                  // In a recursive call: let the caller deal with it
                  get_out = true;
                }
                break;
              }
            case '|':
              {
                result += s.substring(0, next);
                if (break_at_pipe && !endings) {
                  // Pipe character at top level
                  get_out = true;
                } else {
                  if (!caption_level && !break_at_pipe && !endings) {
                    result += '&#x7C;'; // Top-level pipe character
                  } else {
                    result += '|';
                  }
                  s = s.substring(next + 1);
                }
                break;
              }
          } // end switch
        } // end while
        if (in_nowiki) {
          result += '</nowiki>';
        } // Make sure this nowiki is closed.

        // Close open templates and tables (a table end must start on a new line)
        while (endings) {
          ch = pop_end();
          result += (ch === "\x06" ? '\n' : '') + ch;
        }
        if (caption_level > 0) {
          // Report to the caller how much of the input was processed. If we stopped early,
          // s still begins `next` characters in front of the token we stopped at.
          var used_up = initial_length - (get_out ? s.length - next : 0);
          if (consumed.length < caption_level) {
            consumed.push(used_up);
          } else {
            consumed[caption_level - 1] = used_up;
          }
        }
        return result;
      };

      // Replace multi-character tokens by one-character placeholders, simplifying the
      // subsequent processing.
      var s = input.replace(/{{/g, "\x01").replace(/\n\s*\|}}}/g, "\n\x06\x02") // Table end + template end
      .replace(/}}/g, "\x02").replace(/\[\[/g, "\x03").replace(/]]/g, "\x04").replace(/\n\s*{\|/g, "\n\x05") // Table start and end must be on own line
      .replace(/^\s*{\|/, "\x05") // Table start at the very beginning
      .replace(/\n\s*\|}/g, "\n\x06") // (we strip leading whitespace)
      .replace(/<\s*gallery\s*>/g, "\x07").replace(/<\/\s*gallery\s*>/g, "\b");
      s = sanitize(s, true, 0, true, false, true, true);
      // with links, allow thumbs, don't break at pipe, allow tables, allow galleries

      // Convert the placeholders back into the real tokens.
      return s.replace(/\u0001/g, '{{').replace(/\u0002/g, '}}').replace(/\u0003/g, '[[').replace(/\u0004/g, ']]').replace(/\u0005/g, '{|').replace(/\u0006/g, '|}').replace(/\u0007/g, '<gallery>').replace(/\u0008/g, '</gallery>');
    }
  };
})(mediaWiki);

/* </nowiki> */