a261733648d552ce1274d2abbae2b23c7e3ad9e5.svn-base 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566
  1. package cn.com.goldenwater.dcproj.utils;
  2. import java.util.*;
  3. import java.util.concurrent.ConcurrentHashMap;
  4. import java.util.concurrent.ConcurrentMap;
  5. import java.util.regex.Matcher;
  6. import java.util.regex.Pattern;
  7. /**
  8. * HTML过滤器,用于去除XSS漏洞隐患。
  9. *
  10. * @author ruoyi
  11. */
  12. public final class HTMLFilter
  13. {
  14. /**
  15. * regex flag union representing /si modifiers in php
  16. **/
  17. private static final int REGEX_FLAGS_SI = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
  18. private static final Pattern P_COMMENTS = Pattern.compile("<!--(.*?)-->", Pattern.DOTALL);
  19. private static final Pattern P_COMMENT = Pattern.compile("^!--(.*)--$", REGEX_FLAGS_SI);
  20. private static final Pattern P_TAGS = Pattern.compile("<(.*?)>", Pattern.DOTALL);
  21. private static final Pattern P_END_TAG = Pattern.compile("^/([a-z0-9]+)", REGEX_FLAGS_SI);
  22. private static final Pattern P_START_TAG = Pattern.compile("^([a-z0-9]+)(.*?)(/?)$", REGEX_FLAGS_SI);
  23. private static final Pattern P_QUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)=([\"'])(.*?)\\2", REGEX_FLAGS_SI);
  24. private static final Pattern P_UNQUOTED_ATTRIBUTES = Pattern.compile("([a-z0-9]+)(=)([^\"\\s']+)", REGEX_FLAGS_SI);
  25. private static final Pattern P_PROTOCOL = Pattern.compile("^([^:]+):", REGEX_FLAGS_SI);
  26. private static final Pattern P_ENTITY = Pattern.compile("&#(\\d+);?");
  27. private static final Pattern P_ENTITY_UNICODE = Pattern.compile("&#x([0-9a-f]+);?");
  28. private static final Pattern P_ENCODE = Pattern.compile("%([0-9a-f]{2});?");
  29. private static final Pattern P_VALID_ENTITIES = Pattern.compile("&([^&;]*)(?=(;|&|$))");
  30. private static final Pattern P_VALID_QUOTES = Pattern.compile("(>|^)([^<]+?)(<|$)", Pattern.DOTALL);
  31. private static final Pattern P_END_ARROW = Pattern.compile("^>");
  32. private static final Pattern P_BODY_TO_END = Pattern.compile("<([^>]*?)(?=<|$)");
  33. private static final Pattern P_XML_CONTENT = Pattern.compile("(^|>)([^<]*?)(?=>)");
  34. private static final Pattern P_STRAY_LEFT_ARROW = Pattern.compile("<([^>]*?)(?=<|$)");
  35. private static final Pattern P_STRAY_RIGHT_ARROW = Pattern.compile("(^|>)([^<]*?)(?=>)");
  36. private static final Pattern P_AMP = Pattern.compile("&");
  37. private static final Pattern P_QUOTE = Pattern.compile("\"");
  38. private static final Pattern P_LEFT_ARROW = Pattern.compile("<");
  39. private static final Pattern P_RIGHT_ARROW = Pattern.compile(">");
  40. private static final Pattern P_BOTH_ARROWS = Pattern.compile("<>");
  41. // @xxx could grow large... maybe use sesat's ReferenceMap
  42. private static final ConcurrentMap<String, Pattern> P_REMOVE_PAIR_BLANKS = new ConcurrentHashMap<>();
  43. private static final ConcurrentMap<String, Pattern> P_REMOVE_SELF_BLANKS = new ConcurrentHashMap<>();
  44. /**
  45. * set of allowed html elements, along with allowed attributes for each element
  46. **/
  47. private final Map<String, List<String>> vAllowed;
  48. /**
  49. * counts of open tags for each (allowable) html element
  50. **/
  51. private final Map<String, Integer> vTagCounts = new HashMap<>();
  52. /**
  53. * html elements which must always be self-closing (e.g. "<img />")
  54. **/
  55. private final String[] vSelfClosingTags;
  56. /**
  57. * html elements which must always have separate opening and closing tags (e.g. "<b></b>")
  58. **/
  59. private final String[] vNeedClosingTags;
  60. /**
  61. * set of disallowed html elements
  62. **/
  63. private final String[] vDisallowed;
  64. /**
  65. * attributes which should be checked for valid protocols
  66. **/
  67. private final String[] vProtocolAtts;
  68. /**
  69. * allowed protocols
  70. **/
  71. private final String[] vAllowedProtocols;
  72. /**
  73. * tags which should be removed if they contain no content (e.g. "<b></b>" or "<b />")
  74. **/
  75. private final String[] vRemoveBlanks;
  76. /**
  77. * entities allowed within html markup
  78. **/
  79. private final String[] vAllowedEntities;
  80. /**
  81. * flag determining whether comments are allowed in input String.
  82. */
  83. private final boolean stripComment;
  84. private final boolean encodeQuotes;
  85. /**
  86. * flag determining whether to try to make tags when presented with "unbalanced" angle brackets (e.g. "<b text </b>"
  87. * becomes "<b> text </b>"). If set to false, unbalanced angle brackets will be html escaped.
  88. */
  89. private final boolean alwaysMakeTags;
  90. /**
  91. * Default constructor.
  92. */
  93. public HTMLFilter()
  94. {
  95. vAllowed = new HashMap<>();
  96. final ArrayList<String> a_atts = new ArrayList<>();
  97. a_atts.add("href");
  98. a_atts.add("target");
  99. vAllowed.put("a", a_atts);
  100. final ArrayList<String> img_atts = new ArrayList<>();
  101. img_atts.add("src");
  102. img_atts.add("width");
  103. img_atts.add("height");
  104. img_atts.add("alt");
  105. vAllowed.put("img", img_atts);
  106. final ArrayList<String> no_atts = new ArrayList<>();
  107. vAllowed.put("b", no_atts);
  108. vAllowed.put("strong", no_atts);
  109. vAllowed.put("i", no_atts);
  110. vAllowed.put("em", no_atts);
  111. vSelfClosingTags = new String[] { "img" };
  112. vNeedClosingTags = new String[] { "a", "b", "strong", "i", "em" };
  113. vDisallowed = new String[] {};
  114. vAllowedProtocols = new String[] { "http", "mailto", "https" }; // no ftp.
  115. vProtocolAtts = new String[] { "src", "href" };
  116. vRemoveBlanks = new String[] { "a", "b", "strong", "i", "em" };
  117. vAllowedEntities = new String[] { "amp", "gt", "lt", "quot" };
  118. stripComment = true;
  119. encodeQuotes = true;
  120. alwaysMakeTags = false;
  121. }
  122. /**
  123. * Map-parameter configurable constructor.
  124. *
  125. * @param conf map containing configuration. keys match field names.
  126. */
  127. @SuppressWarnings("unchecked")
  128. public HTMLFilter(final Map<String, Object> conf)
  129. {
  130. assert conf.containsKey("vAllowed") : "configuration requires vAllowed";
  131. assert conf.containsKey("vSelfClosingTags") : "configuration requires vSelfClosingTags";
  132. assert conf.containsKey("vNeedClosingTags") : "configuration requires vNeedClosingTags";
  133. assert conf.containsKey("vDisallowed") : "configuration requires vDisallowed";
  134. assert conf.containsKey("vAllowedProtocols") : "configuration requires vAllowedProtocols";
  135. assert conf.containsKey("vProtocolAtts") : "configuration requires vProtocolAtts";
  136. assert conf.containsKey("vRemoveBlanks") : "configuration requires vRemoveBlanks";
  137. assert conf.containsKey("vAllowedEntities") : "configuration requires vAllowedEntities";
  138. vAllowed = Collections.unmodifiableMap((HashMap<String, List<String>>) conf.get("vAllowed"));
  139. vSelfClosingTags = (String[]) conf.get("vSelfClosingTags");
  140. vNeedClosingTags = (String[]) conf.get("vNeedClosingTags");
  141. vDisallowed = (String[]) conf.get("vDisallowed");
  142. vAllowedProtocols = (String[]) conf.get("vAllowedProtocols");
  143. vProtocolAtts = (String[]) conf.get("vProtocolAtts");
  144. vRemoveBlanks = (String[]) conf.get("vRemoveBlanks");
  145. vAllowedEntities = (String[]) conf.get("vAllowedEntities");
  146. stripComment = conf.containsKey("stripComment") ? (Boolean) conf.get("stripComment") : true;
  147. encodeQuotes = conf.containsKey("encodeQuotes") ? (Boolean) conf.get("encodeQuotes") : true;
  148. alwaysMakeTags = conf.containsKey("alwaysMakeTags") ? (Boolean) conf.get("alwaysMakeTags") : true;
  149. }
  150. private void reset()
  151. {
  152. vTagCounts.clear();
  153. }
  154. // ---------------------------------------------------------------
  155. // my versions of some PHP library functions
  156. public static String chr(final int decimal)
  157. {
  158. return String.valueOf((char) decimal);
  159. }
  160. public static String htmlSpecialChars(final String s)
  161. {
  162. String result = s;
  163. result = regexReplace(P_AMP, "&amp;", result);
  164. result = regexReplace(P_QUOTE, "&quot;", result);
  165. result = regexReplace(P_LEFT_ARROW, "&lt;", result);
  166. result = regexReplace(P_RIGHT_ARROW, "&gt;", result);
  167. return result;
  168. }
  169. // ---------------------------------------------------------------
  170. /**
  171. * given a user submitted input String, filter out any invalid or restricted html.
  172. *
  173. * @param input text (i.e. submitted by a user) than may contain html
  174. * @return "clean" version of input, with only valid, whitelisted html elements allowed
  175. */
  176. public String filter(final String input)
  177. {
  178. reset();
  179. String s = input;
  180. s = escapeComments(s);
  181. s = balanceHTML(s);
  182. s = checkTags(s);
  183. s = processRemoveBlanks(s);
  184. // s = validateEntities(s);
  185. return s;
  186. }
  187. public boolean isAlwaysMakeTags()
  188. {
  189. return alwaysMakeTags;
  190. }
  191. public boolean isStripComments()
  192. {
  193. return stripComment;
  194. }
  195. private String escapeComments(final String s)
  196. {
  197. final Matcher m = P_COMMENTS.matcher(s);
  198. final StringBuffer buf = new StringBuffer();
  199. if (m.find())
  200. {
  201. final String match = m.group(1); // (.*?)
  202. m.appendReplacement(buf, Matcher.quoteReplacement("<!--" + htmlSpecialChars(match) + "-->"));
  203. }
  204. m.appendTail(buf);
  205. return buf.toString();
  206. }
  207. private String balanceHTML(String s)
  208. {
  209. if (alwaysMakeTags)
  210. {
  211. //
  212. // try and form html
  213. //
  214. s = regexReplace(P_END_ARROW, "", s);
  215. // 不追加结束标签
  216. s = regexReplace(P_BODY_TO_END, "<$1>", s);
  217. s = regexReplace(P_XML_CONTENT, "$1<$2", s);
  218. }
  219. else
  220. {
  221. //
  222. // escape stray brackets
  223. //
  224. s = regexReplace(P_STRAY_LEFT_ARROW, "&lt;$1", s);
  225. s = regexReplace(P_STRAY_RIGHT_ARROW, "$1$2&gt;<", s);
  226. //
  227. // the last regexp causes '<>' entities to appear
  228. // (we need to do a lookahead assertion so that the last bracket can
  229. // be used in the next pass of the regexp)
  230. //
  231. s = regexReplace(P_BOTH_ARROWS, "", s);
  232. }
  233. return s;
  234. }
  235. private String checkTags(String s)
  236. {
  237. Matcher m = P_TAGS.matcher(s);
  238. final StringBuffer buf = new StringBuffer();
  239. while (m.find())
  240. {
  241. String replaceStr = m.group(1);
  242. replaceStr = processTag(replaceStr);
  243. m.appendReplacement(buf, Matcher.quoteReplacement(replaceStr));
  244. }
  245. m.appendTail(buf);
  246. // these get tallied in processTag
  247. // (remember to reset before subsequent calls to filter method)
  248. final StringBuilder sBuilder = new StringBuilder(buf.toString());
  249. for (String key : vTagCounts.keySet())
  250. {
  251. for (int ii = 0; ii < vTagCounts.get(key); ii++)
  252. {
  253. sBuilder.append("</").append(key).append(">");
  254. }
  255. }
  256. s = sBuilder.toString();
  257. return s;
  258. }
  259. private String processRemoveBlanks(final String s)
  260. {
  261. String result = s;
  262. for (String tag : vRemoveBlanks)
  263. {
  264. if (!P_REMOVE_PAIR_BLANKS.containsKey(tag))
  265. {
  266. P_REMOVE_PAIR_BLANKS.putIfAbsent(tag, Pattern.compile("<" + tag + "(\\s[^>]*)?></" + tag + ">"));
  267. }
  268. result = regexReplace(P_REMOVE_PAIR_BLANKS.get(tag), "", result);
  269. if (!P_REMOVE_SELF_BLANKS.containsKey(tag))
  270. {
  271. P_REMOVE_SELF_BLANKS.putIfAbsent(tag, Pattern.compile("<" + tag + "(\\s[^>]*)?/>"));
  272. }
  273. result = regexReplace(P_REMOVE_SELF_BLANKS.get(tag), "", result);
  274. }
  275. return result;
  276. }
  277. private static String regexReplace(final Pattern regex_pattern, final String replacement, final String s)
  278. {
  279. Matcher m = regex_pattern.matcher(s);
  280. return m.replaceAll(replacement);
  281. }
  282. private String processTag(final String s)
  283. {
  284. // ending tags
  285. Matcher m = P_END_TAG.matcher(s);
  286. if (m.find())
  287. {
  288. final String name = m.group(1).toLowerCase();
  289. if (allowed(name))
  290. {
  291. if (!inArray(name, vSelfClosingTags))
  292. {
  293. if (vTagCounts.containsKey(name))
  294. {
  295. vTagCounts.put(name, vTagCounts.get(name) - 1);
  296. return "</" + name + ">";
  297. }
  298. }
  299. }
  300. }
  301. // starting tags
  302. m = P_START_TAG.matcher(s);
  303. if (m.find())
  304. {
  305. final String name = m.group(1).toLowerCase();
  306. final String body = m.group(2);
  307. String ending = m.group(3);
  308. // debug( "in a starting tag, name='" + name + "'; body='" + body + "'; ending='" + ending + "'" );
  309. if (allowed(name))
  310. {
  311. final StringBuilder params = new StringBuilder();
  312. final Matcher m2 = P_QUOTED_ATTRIBUTES.matcher(body);
  313. final Matcher m3 = P_UNQUOTED_ATTRIBUTES.matcher(body);
  314. final List<String> paramNames = new ArrayList<>();
  315. final List<String> paramValues = new ArrayList<>();
  316. while (m2.find())
  317. {
  318. paramNames.add(m2.group(1)); // ([a-z0-9]+)
  319. paramValues.add(m2.group(3)); // (.*?)
  320. }
  321. while (m3.find())
  322. {
  323. paramNames.add(m3.group(1)); // ([a-z0-9]+)
  324. paramValues.add(m3.group(3)); // ([^\"\\s']+)
  325. }
  326. String paramName, paramValue;
  327. for (int ii = 0; ii < paramNames.size(); ii++)
  328. {
  329. paramName = paramNames.get(ii).toLowerCase();
  330. paramValue = paramValues.get(ii);
  331. // debug( "paramName='" + paramName + "'" );
  332. // debug( "paramValue='" + paramValue + "'" );
  333. // debug( "allowed? " + vAllowed.get( name ).contains( paramName ) );
  334. if (allowedAttribute(name, paramName))
  335. {
  336. if (inArray(paramName, vProtocolAtts))
  337. {
  338. paramValue = processParamProtocol(paramValue);
  339. }
  340. params.append(' ').append(paramName).append("=\\\"").append(paramValue).append("\"");
  341. }
  342. }
  343. if (inArray(name, vSelfClosingTags))
  344. {
  345. ending = " /";
  346. }
  347. if (inArray(name, vNeedClosingTags))
  348. {
  349. ending = "";
  350. }
  351. if (ending == null || ending.length() < 1)
  352. {
  353. if (vTagCounts.containsKey(name))
  354. {
  355. vTagCounts.put(name, vTagCounts.get(name) + 1);
  356. }
  357. else
  358. {
  359. vTagCounts.put(name, 1);
  360. }
  361. }
  362. else
  363. {
  364. ending = " /";
  365. }
  366. return "<" + name + params + ending + ">";
  367. }
  368. else
  369. {
  370. return "";
  371. }
  372. }
  373. // comments
  374. m = P_COMMENT.matcher(s);
  375. if (!stripComment && m.find())
  376. {
  377. return "<" + m.group() + ">";
  378. }
  379. return "";
  380. }
  381. private String processParamProtocol(String s)
  382. {
  383. s = decodeEntities(s);
  384. final Matcher m = P_PROTOCOL.matcher(s);
  385. if (m.find())
  386. {
  387. final String protocol = m.group(1);
  388. if (!inArray(protocol, vAllowedProtocols))
  389. {
  390. // bad protocol, turn into local anchor link instead
  391. s = "#" + s.substring(protocol.length() + 1);
  392. if (s.startsWith("#//"))
  393. {
  394. s = "#" + s.substring(3);
  395. }
  396. }
  397. }
  398. return s;
  399. }
  400. private String decodeEntities(String s)
  401. {
  402. StringBuffer buf = new StringBuffer();
  403. Matcher m = P_ENTITY.matcher(s);
  404. while (m.find())
  405. {
  406. final String match = m.group(1);
  407. final int decimal = Integer.decode(match).intValue();
  408. m.appendReplacement(buf, Matcher.quoteReplacement(chr(decimal)));
  409. }
  410. m.appendTail(buf);
  411. s = buf.toString();
  412. buf = new StringBuffer();
  413. m = P_ENTITY_UNICODE.matcher(s);
  414. while (m.find())
  415. {
  416. final String match = m.group(1);
  417. final int decimal = Integer.valueOf(match, 16).intValue();
  418. m.appendReplacement(buf, Matcher.quoteReplacement(chr(decimal)));
  419. }
  420. m.appendTail(buf);
  421. s = buf.toString();
  422. buf = new StringBuffer();
  423. m = P_ENCODE.matcher(s);
  424. while (m.find())
  425. {
  426. final String match = m.group(1);
  427. final int decimal = Integer.valueOf(match, 16).intValue();
  428. m.appendReplacement(buf, Matcher.quoteReplacement(chr(decimal)));
  429. }
  430. m.appendTail(buf);
  431. s = buf.toString();
  432. s = validateEntities(s);
  433. return s;
  434. }
  435. private String validateEntities(final String s)
  436. {
  437. StringBuffer buf = new StringBuffer();
  438. // validate entities throughout the string
  439. Matcher m = P_VALID_ENTITIES.matcher(s);
  440. while (m.find())
  441. {
  442. final String one = m.group(1); // ([^&;]*)
  443. final String two = m.group(2); // (?=(;|&|$))
  444. m.appendReplacement(buf, Matcher.quoteReplacement(checkEntity(one, two)));
  445. }
  446. m.appendTail(buf);
  447. return encodeQuotes(buf.toString());
  448. }
  449. private String encodeQuotes(final String s)
  450. {
  451. if (encodeQuotes)
  452. {
  453. StringBuffer buf = new StringBuffer();
  454. Matcher m = P_VALID_QUOTES.matcher(s);
  455. while (m.find())
  456. {
  457. final String one = m.group(1); // (>|^)
  458. final String two = m.group(2); // ([^<]+?)
  459. final String three = m.group(3); // (<|$)
  460. // 不替换双引号为&quot;,防止json格式无效 regexReplace(P_QUOTE, "&quot;", two)
  461. m.appendReplacement(buf, Matcher.quoteReplacement(one + two + three));
  462. }
  463. m.appendTail(buf);
  464. return buf.toString();
  465. }
  466. else
  467. {
  468. return s;
  469. }
  470. }
  471. private String checkEntity(final String preamble, final String term)
  472. {
  473. return ";".equals(term) && isValidEntity(preamble) ? '&' + preamble : "&amp;" + preamble;
  474. }
  475. private boolean isValidEntity(final String entity)
  476. {
  477. return inArray(entity, vAllowedEntities);
  478. }
  479. private static boolean inArray(final String s, final String[] array)
  480. {
  481. for (String item : array)
  482. {
  483. if (item != null && item.equals(s))
  484. {
  485. return true;
  486. }
  487. }
  488. return false;
  489. }
  490. private boolean allowed(final String name)
  491. {
  492. return (vAllowed.isEmpty() || vAllowed.containsKey(name)) && !inArray(name, vDisallowed);
  493. }
  494. private boolean allowedAttribute(final String name, final String paramName)
  495. {
  496. return allowed(name) && (vAllowed.isEmpty() || vAllowed.get(name).contains(paramName));
  497. }
  498. }