Index: test/test_builder.rb =================================================================== --- test/test_builder.rb (revision 161) +++ test/test_builder.rb (working copy) @@ -27,4 +27,11 @@ assert_equal "Some text", Hpricot(text).to_html end + + def test_korean_utf8_entities + # a = '한글' + a = "\xed\x95\x9c\xea\xb8\x80" + doc = Hpricot() { b a } + assert_equal "한글", doc.to_html + end end Index: ext/fast_xs/FastXsService.java =================================================================== --- ext/fast_xs/FastXsService.java (revision 0) +++ ext/fast_xs/FastXsService.java (revision 0) @@ -0,0 +1,1018 @@ + +import java.io.IOException; +import java.io.StringWriter; +import java.io.Writer; +import java.util.HashMap; +import java.util.Map; +import java.util.TreeMap; +import org.jruby.Ruby; +import org.jruby.RubyModule; +import org.jruby.runtime.CallbackFactory; +import org.jruby.runtime.builtin.IRubyObject; +import org.jruby.runtime.load.BasicLibraryService; +import org.jruby.util.collections.IntHashMap; + +public class FastXsService implements BasicLibraryService { + + public boolean basicLoad(final Ruby runtime) throws IOException { + RubyModule string = runtime.getModule("String"); + CallbackFactory fact = runtime.callbackFactory(FastXsService.class); + string.defineMethod("fast_xs",fact.getFastSingletonMethod("fast_xs")); + return true; + } + + public static IRubyObject fast_xs(IRubyObject recv) { + String string = recv.convertToString().getUnicodeValue(); + StringWriter writer = new StringWriter ((int)(string.length() * 1.5)); + try { + Entities.HTML40.escape(writer, string); + return recv.getRuntime().newString(writer.toString()); + } catch (IOException e) { + throw recv.getRuntime().newIOErrorFromException(e); + } + } +} + +// From Apache commons-lang, +// http://svn.apache.org/viewvc/commons/proper/lang/trunk/src/java/org/apache/commons/lang/Entities.java?revision=560660&view=markup +/* + * Licensed to the Apache Software Foundation (ASF) under one or 
more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + *
+ * Provides HTML and XML entity utilities. + *
+ * + * @see ISO Entities + * @see HTML 3.2 Character Entities for ISO Latin-1 + * @see HTML 4.0 Character entity references + * @see HTML 4.01 Character References + * @see HTML 4.01 Code positions + * + * @author Alexander Day Chaffee + * @author Gary Gregory + * @since 2.0 + * @version $Id$ + */ +class Entities { + + private static final String[][] BASIC_ARRAY = {{"quot", "34"}, // " - double-quote + {"amp", "38"}, // & - ampersand + {"lt", "60"}, // < - less-than + {"gt", "62"}, // > - greater-than + }; + + private static final String[][] APOS_ARRAY = {{"apos", "39"}, // XML apostrophe + }; + + // package scoped for testing + static final String[][] ISO8859_1_ARRAY = {{"nbsp", "160"}, // non-breaking space + {"iexcl", "161"}, // inverted exclamation mark + {"cent", "162"}, // cent sign + {"pound", "163"}, // pound sign + {"curren", "164"}, // currency sign + {"yen", "165"}, // yen sign = yuan sign + {"brvbar", "166"}, // broken bar = broken vertical bar + {"sect", "167"}, // section sign + {"uml", "168"}, // diaeresis = spacing diaeresis + {"copy", "169"}, // © - copyright sign + {"ordf", "170"}, // feminine ordinal indicator + {"laquo", "171"}, // left-pointing double angle quotation mark = left pointing guillemet + {"not", "172"}, // not sign + {"shy", "173"}, // soft hyphen = discretionary hyphen + {"reg", "174"}, // ® - registered trademark sign + {"macr", "175"}, // macron = spacing macron = overline = APL overbar + {"deg", "176"}, // degree sign + {"plusmn", "177"}, // plus-minus sign = plus-or-minus sign + {"sup2", "178"}, // superscript two = superscript digit two = squared + {"sup3", "179"}, // superscript three = superscript digit three = cubed + {"acute", "180"}, // acute accent = spacing acute + {"micro", "181"}, // micro sign + {"para", "182"}, // pilcrow sign = paragraph sign + {"middot", "183"}, // middle dot = Georgian comma = Greek middle dot + {"cedil", "184"}, // cedilla = spacing cedilla + {"sup1", "185"}, // superscript one = superscript 
digit one + {"ordm", "186"}, // masculine ordinal indicator + {"raquo", "187"}, // right-pointing double angle quotation mark = right pointing guillemet + {"frac14", "188"}, // vulgar fraction one quarter = fraction one quarter + {"frac12", "189"}, // vulgar fraction one half = fraction one half + {"frac34", "190"}, // vulgar fraction three quarters = fraction three quarters + {"iquest", "191"}, // inverted question mark = turned question mark + {"Agrave", "192"}, // À - uppercase A, grave accent + {"Aacute", "193"}, // Á - uppercase A, acute accent + {"Acirc", "194"}, // Â - uppercase A, circumflex accent + {"Atilde", "195"}, // Ã - uppercase A, tilde + {"Auml", "196"}, // Ä - uppercase A, umlaut + {"Aring", "197"}, // Å - uppercase A, ring + {"AElig", "198"}, // Æ - uppercase AE + {"Ccedil", "199"}, // Ç - uppercase C, cedilla + {"Egrave", "200"}, // È - uppercase E, grave accent + {"Eacute", "201"}, // É - uppercase E, acute accent + {"Ecirc", "202"}, // Ê - uppercase E, circumflex accent + {"Euml", "203"}, // Ë - uppercase E, umlaut + {"Igrave", "204"}, // Ì - uppercase I, grave accent + {"Iacute", "205"}, // Í - uppercase I, acute accent + {"Icirc", "206"}, // Î - uppercase I, circumflex accent + {"Iuml", "207"}, // Ï - uppercase I, umlaut + {"ETH", "208"}, // Ð - uppercase Eth, Icelandic + {"Ntilde", "209"}, // Ñ - uppercase N, tilde + {"Ograve", "210"}, // Ò - uppercase O, grave accent + {"Oacute", "211"}, // Ó - uppercase O, acute accent + {"Ocirc", "212"}, // Ô - uppercase O, circumflex accent + {"Otilde", "213"}, // Õ - uppercase O, tilde + {"Ouml", "214"}, // Ö - uppercase O, umlaut + {"times", "215"}, // multiplication sign + {"Oslash", "216"}, // Ø - uppercase O, slash + {"Ugrave", "217"}, // Ù - uppercase U, grave accent + {"Uacute", "218"}, // Ú - uppercase U, acute accent + {"Ucirc", "219"}, // Û - uppercase U, circumflex accent + {"Uuml", "220"}, // Ü - uppercase U, umlaut + {"Yacute", "221"}, // Ý - uppercase Y, acute accent + {"THORN", "222"}, // 
Þ - uppercase THORN, Icelandic + {"szlig", "223"}, // ß - lowercase sharps, German + {"agrave", "224"}, // à - lowercase a, grave accent + {"aacute", "225"}, // á - lowercase a, acute accent + {"acirc", "226"}, // â - lowercase a, circumflex accent + {"atilde", "227"}, // ã - lowercase a, tilde + {"auml", "228"}, // ä - lowercase a, umlaut + {"aring", "229"}, // å - lowercase a, ring + {"aelig", "230"}, // æ - lowercase ae + {"ccedil", "231"}, // ç - lowercase c, cedilla + {"egrave", "232"}, // è - lowercase e, grave accent + {"eacute", "233"}, // é - lowercase e, acute accent + {"ecirc", "234"}, // ê - lowercase e, circumflex accent + {"euml", "235"}, // ë - lowercase e, umlaut + {"igrave", "236"}, // ì - lowercase i, grave accent + {"iacute", "237"}, // í - lowercase i, acute accent + {"icirc", "238"}, // î - lowercase i, circumflex accent + {"iuml", "239"}, // ï - lowercase i, umlaut + {"eth", "240"}, // ð - lowercase eth, Icelandic + {"ntilde", "241"}, // ñ - lowercase n, tilde + {"ograve", "242"}, // ò - lowercase o, grave accent + {"oacute", "243"}, // ó - lowercase o, acute accent + {"ocirc", "244"}, // ô - lowercase o, circumflex accent + {"otilde", "245"}, // õ - lowercase o, tilde + {"ouml", "246"}, // ö - lowercase o, umlaut + {"divide", "247"}, // division sign + {"oslash", "248"}, // ø - lowercase o, slash + {"ugrave", "249"}, // ù - lowercase u, grave accent + {"uacute", "250"}, // ú - lowercase u, acute accent + {"ucirc", "251"}, // û - lowercase u, circumflex accent + {"uuml", "252"}, // ü - lowercase u, umlaut + {"yacute", "253"}, // ý - lowercase y, acute accent + {"thorn", "254"}, // þ - lowercase thorn, Icelandic + {"yuml", "255"}, // ÿ - lowercase y, umlaut + }; + + // http://www.w3.org/TR/REC-html40/sgml/entities.html + // package scoped for testing + static final String[][] HTML40_ARRAY = { + // + {"fnof", "402"}, // latin small f with hook = function= florin, U+0192 ISOtech --> + // + {"Alpha", "913"}, // greek capital letter alpha, U+0391 
--> + {"Beta", "914"}, // greek capital letter beta, U+0392 --> + {"Gamma", "915"}, // greek capital letter gamma,U+0393 ISOgrk3 --> + {"Delta", "916"}, // greek capital letter delta,U+0394 ISOgrk3 --> + {"Epsilon", "917"}, // greek capital letter epsilon, U+0395 --> + {"Zeta", "918"}, // greek capital letter zeta, U+0396 --> + {"Eta", "919"}, // greek capital letter eta, U+0397 --> + {"Theta", "920"}, // greek capital letter theta,U+0398 ISOgrk3 --> + {"Iota", "921"}, // greek capital letter iota, U+0399 --> + {"Kappa", "922"}, // greek capital letter kappa, U+039A --> + {"Lambda", "923"}, // greek capital letter lambda,U+039B ISOgrk3 --> + {"Mu", "924"}, // greek capital letter mu, U+039C --> + {"Nu", "925"}, // greek capital letter nu, U+039D --> + {"Xi", "926"}, // greek capital letter xi, U+039E ISOgrk3 --> + {"Omicron", "927"}, // greek capital letter omicron, U+039F --> + {"Pi", "928"}, // greek capital letter pi, U+03A0 ISOgrk3 --> + {"Rho", "929"}, // greek capital letter rho, U+03A1 --> + // + {"Sigma", "931"}, // greek capital letter sigma,U+03A3 ISOgrk3 --> + {"Tau", "932"}, // greek capital letter tau, U+03A4 --> + {"Upsilon", "933"}, // greek capital letter upsilon,U+03A5 ISOgrk3 --> + {"Phi", "934"}, // greek capital letter phi,U+03A6 ISOgrk3 --> + {"Chi", "935"}, // greek capital letter chi, U+03A7 --> + {"Psi", "936"}, // greek capital letter psi,U+03A8 ISOgrk3 --> + {"Omega", "937"}, // greek capital letter omega,U+03A9 ISOgrk3 --> + {"alpha", "945"}, // greek small letter alpha,U+03B1 ISOgrk3 --> + {"beta", "946"}, // greek small letter beta, U+03B2 ISOgrk3 --> + {"gamma", "947"}, // greek small letter gamma,U+03B3 ISOgrk3 --> + {"delta", "948"}, // greek small letter delta,U+03B4 ISOgrk3 --> + {"epsilon", "949"}, // greek small letter epsilon,U+03B5 ISOgrk3 --> + {"zeta", "950"}, // greek small letter zeta, U+03B6 ISOgrk3 --> + {"eta", "951"}, // greek small letter eta, U+03B7 ISOgrk3 --> + {"theta", "952"}, // greek small letter theta,U+03B8 
ISOgrk3 --> + {"iota", "953"}, // greek small letter iota, U+03B9 ISOgrk3 --> + {"kappa", "954"}, // greek small letter kappa,U+03BA ISOgrk3 --> + {"lambda", "955"}, // greek small letter lambda,U+03BB ISOgrk3 --> + {"mu", "956"}, // greek small letter mu, U+03BC ISOgrk3 --> + {"nu", "957"}, // greek small letter nu, U+03BD ISOgrk3 --> + {"xi", "958"}, // greek small letter xi, U+03BE ISOgrk3 --> + {"omicron", "959"}, // greek small letter omicron, U+03BF NEW --> + {"pi", "960"}, // greek small letter pi, U+03C0 ISOgrk3 --> + {"rho", "961"}, // greek small letter rho, U+03C1 ISOgrk3 --> + {"sigmaf", "962"}, // greek small letter final sigma,U+03C2 ISOgrk3 --> + {"sigma", "963"}, // greek small letter sigma,U+03C3 ISOgrk3 --> + {"tau", "964"}, // greek small letter tau, U+03C4 ISOgrk3 --> + {"upsilon", "965"}, // greek small letter upsilon,U+03C5 ISOgrk3 --> + {"phi", "966"}, // greek small letter phi, U+03C6 ISOgrk3 --> + {"chi", "967"}, // greek small letter chi, U+03C7 ISOgrk3 --> + {"psi", "968"}, // greek small letter psi, U+03C8 ISOgrk3 --> + {"omega", "969"}, // greek small letter omega,U+03C9 ISOgrk3 --> + {"thetasym", "977"}, // greek small letter theta symbol,U+03D1 NEW --> + {"upsih", "978"}, // greek upsilon with hook symbol,U+03D2 NEW --> + {"piv", "982"}, // greek pi symbol, U+03D6 ISOgrk3 --> + // + {"bull", "8226"}, // bullet = black small circle,U+2022 ISOpub --> + // + {"hellip", "8230"}, // horizontal ellipsis = three dot leader,U+2026 ISOpub --> + {"prime", "8242"}, // prime = minutes = feet, U+2032 ISOtech --> + {"Prime", "8243"}, // double prime = seconds = inches,U+2033 ISOtech --> + {"oline", "8254"}, // overline = spacing overscore,U+203E NEW --> + {"frasl", "8260"}, // fraction slash, U+2044 NEW --> + // + {"weierp", "8472"}, // script capital P = power set= Weierstrass p, U+2118 ISOamso --> + {"image", "8465"}, // blackletter capital I = imaginary part,U+2111 ISOamso --> + {"real", "8476"}, // blackletter capital R = real part 
symbol,U+211C ISOamso --> + {"trade", "8482"}, // trade mark sign, U+2122 ISOnum --> + {"alefsym", "8501"}, // alef symbol = first transfinite cardinal,U+2135 NEW --> + // + // + {"larr", "8592"}, // leftwards arrow, U+2190 ISOnum --> + {"uarr", "8593"}, // upwards arrow, U+2191 ISOnum--> + {"rarr", "8594"}, // rightwards arrow, U+2192 ISOnum --> + {"darr", "8595"}, // downwards arrow, U+2193 ISOnum --> + {"harr", "8596"}, // left right arrow, U+2194 ISOamsa --> + {"crarr", "8629"}, // downwards arrow with corner leftwards= carriage return, U+21B5 NEW --> + {"lArr", "8656"}, // leftwards double arrow, U+21D0 ISOtech --> + // + {"uArr", "8657"}, // upwards double arrow, U+21D1 ISOamsa --> + {"rArr", "8658"}, // rightwards double arrow,U+21D2 ISOtech --> + // + {"dArr", "8659"}, // downwards double arrow, U+21D3 ISOamsa --> + {"hArr", "8660"}, // left right double arrow,U+21D4 ISOamsa --> + // + {"forall", "8704"}, // for all, U+2200 ISOtech --> + {"part", "8706"}, // partial differential, U+2202 ISOtech --> + {"exist", "8707"}, // there exists, U+2203 ISOtech --> + {"empty", "8709"}, // empty set = null set = diameter,U+2205 ISOamso --> + {"nabla", "8711"}, // nabla = backward difference,U+2207 ISOtech --> + {"isin", "8712"}, // element of, U+2208 ISOtech --> + {"notin", "8713"}, // not an element of, U+2209 ISOtech --> + {"ni", "8715"}, // contains as member, U+220B ISOtech --> + // + {"prod", "8719"}, // n-ary product = product sign,U+220F ISOamsb --> + // + {"sum", "8721"}, // n-ary summation, U+2211 ISOamsb --> + // + {"minus", "8722"}, // minus sign, U+2212 ISOtech --> + {"lowast", "8727"}, // asterisk operator, U+2217 ISOtech --> + {"radic", "8730"}, // square root = radical sign,U+221A ISOtech --> + {"prop", "8733"}, // proportional to, U+221D ISOtech --> + {"infin", "8734"}, // infinity, U+221E ISOtech --> + {"ang", "8736"}, // angle, U+2220 ISOamso --> + {"and", "8743"}, // logical and = wedge, U+2227 ISOtech --> + {"or", "8744"}, // logical or = vee, 
U+2228 ISOtech --> + {"cap", "8745"}, // intersection = cap, U+2229 ISOtech --> + {"cup", "8746"}, // union = cup, U+222A ISOtech --> + {"int", "8747"}, // integral, U+222B ISOtech --> + {"there4", "8756"}, // therefore, U+2234 ISOtech --> + {"sim", "8764"}, // tilde operator = varies with = similar to,U+223C ISOtech --> + // + {"cong", "8773"}, // approximately equal to, U+2245 ISOtech --> + {"asymp", "8776"}, // almost equal to = asymptotic to,U+2248 ISOamsr --> + {"ne", "8800"}, // not equal to, U+2260 ISOtech --> + {"equiv", "8801"}, // identical to, U+2261 ISOtech --> + {"le", "8804"}, // less-than or equal to, U+2264 ISOtech --> + {"ge", "8805"}, // greater-than or equal to,U+2265 ISOtech --> + {"sub", "8834"}, // subset of, U+2282 ISOtech --> + {"sup", "8835"}, // superset of, U+2283 ISOtech --> + // + {"sube", "8838"}, // subset of or equal to, U+2286 ISOtech --> + {"supe", "8839"}, // superset of or equal to,U+2287 ISOtech --> + {"oplus", "8853"}, // circled plus = direct sum,U+2295 ISOamsb --> + {"otimes", "8855"}, // circled times = vector product,U+2297 ISOamsb --> + {"perp", "8869"}, // up tack = orthogonal to = perpendicular,U+22A5 ISOtech --> + {"sdot", "8901"}, // dot operator, U+22C5 ISOamsb --> + // + // + {"lceil", "8968"}, // left ceiling = apl upstile,U+2308 ISOamsc --> + {"rceil", "8969"}, // right ceiling, U+2309 ISOamsc --> + {"lfloor", "8970"}, // left floor = apl downstile,U+230A ISOamsc --> + {"rfloor", "8971"}, // right floor, U+230B ISOamsc --> + {"lang", "9001"}, // left-pointing angle bracket = bra,U+2329 ISOtech --> + // + {"rang", "9002"}, // right-pointing angle bracket = ket,U+232A ISOtech --> + // + // + {"loz", "9674"}, // lozenge, U+25CA ISOpub --> + // + {"spades", "9824"}, // black spade suit, U+2660 ISOpub --> + // + {"clubs", "9827"}, // black club suit = shamrock,U+2663 ISOpub --> + {"hearts", "9829"}, // black heart suit = valentine,U+2665 ISOpub --> + {"diams", "9830"}, // black diamond suit, U+2666 ISOpub --> + + // + 
{"OElig", "338"}, // -- latin capital ligature OE,U+0152 ISOlat2 --> + {"oelig", "339"}, // -- latin small ligature oe, U+0153 ISOlat2 --> + // + {"Scaron", "352"}, // -- latin capital letter S with caron,U+0160 ISOlat2 --> + {"scaron", "353"}, // -- latin small letter s with caron,U+0161 ISOlat2 --> + {"Yuml", "376"}, // -- latin capital letter Y with diaeresis,U+0178 ISOlat2 --> + // + {"circ", "710"}, // -- modifier letter circumflex accent,U+02C6 ISOpub --> + {"tilde", "732"}, // small tilde, U+02DC ISOdia --> + // + {"ensp", "8194"}, // en space, U+2002 ISOpub --> + {"emsp", "8195"}, // em space, U+2003 ISOpub --> + {"thinsp", "8201"}, // thin space, U+2009 ISOpub --> + {"zwnj", "8204"}, // zero width non-joiner,U+200C NEW RFC 2070 --> + {"zwj", "8205"}, // zero width joiner, U+200D NEW RFC 2070 --> + {"lrm", "8206"}, // left-to-right mark, U+200E NEW RFC 2070 --> + {"rlm", "8207"}, // right-to-left mark, U+200F NEW RFC 2070 --> + {"ndash", "8211"}, // en dash, U+2013 ISOpub --> + {"mdash", "8212"}, // em dash, U+2014 ISOpub --> + {"lsquo", "8216"}, // left single quotation mark,U+2018 ISOnum --> + {"rsquo", "8217"}, // right single quotation mark,U+2019 ISOnum --> + {"sbquo", "8218"}, // single low-9 quotation mark, U+201A NEW --> + {"ldquo", "8220"}, // left double quotation mark,U+201C ISOnum --> + {"rdquo", "8221"}, // right double quotation mark,U+201D ISOnum --> + {"bdquo", "8222"}, // double low-9 quotation mark, U+201E NEW --> + {"dagger", "8224"}, // dagger, U+2020 ISOpub --> + {"Dagger", "8225"}, // double dagger, U+2021 ISOpub --> + {"permil", "8240"}, // per mille sign, U+2030 ISOtech --> + {"lsaquo", "8249"}, // single left-pointing angle quotation mark,U+2039 ISO proposed --> + // + {"rsaquo", "8250"}, // single right-pointing angle quotation mark,U+203A ISO proposed --> + // + {"euro", "8364"}, // -- euro sign, U+20AC NEW --> + }; + + /** + *+ * The set of entities supported by standard XML. + *
+ */ + public static final Entities XML; + + /** + *+ * The set of entities supported by HTML 3.2. + *
+ */ + public static final Entities HTML32; + + /** + *+ * The set of entities supported by HTML 4.0. + *
+ */ + public static final Entities HTML40; + + static { + XML = new Entities(); + XML.addEntities(BASIC_ARRAY); + XML.addEntities(APOS_ARRAY); + } + + static { + HTML32 = new Entities(); + HTML32.addEntities(BASIC_ARRAY); + HTML32.addEntities(ISO8859_1_ARRAY); + } + + static { + HTML40 = new Entities(); + fillWithHtml40Entities(HTML40); + } + + /** + *+ * Fills the specified entities instance with HTML 40 entities. + *
+ * + * @param entities + * the instance to be filled. + */ + static void fillWithHtml40Entities(Entities entities) { + entities.addEntities(BASIC_ARRAY); + entities.addEntities(ISO8859_1_ARRAY); + entities.addEntities(HTML40_ARRAY); + } + + static interface EntityMap { + /** + *+ * Add an entry to this entity map. + *
+ * + * @param name + * the entity name + * @param value + * the entity value + */ + void add(String name, int value); + + /** + *+ * Returns the name of the entity identified by the specified value. + *
+ * + * @param value + * the value to locate + * @return entity name associated with the specified value + */ + String name(int value); + + /** + *+ * Returns the value of the entity identified by the specified name. + *
+ * + * @param name + * the name to locate + * @return entity value associated with the specified name + */ + int value(String name); + } + + static class PrimitiveEntityMap implements EntityMap { + private Map mapNameToValue = new HashMap(); + + private IntHashMap mapValueToName = new IntHashMap(); + + /** + * {@inheritDoc} + */ + public void add(String name, int value) { + mapNameToValue.put(name, new Integer(value)); + mapValueToName.put(value, name); + } + + /** + * {@inheritDoc} + */ + public String name(int value) { + return (String) mapValueToName.get(value); + } + + /** + * {@inheritDoc} + */ + public int value(String name) { + Object value = mapNameToValue.get(name); + if (value == null) { + return -1; + } + return ((Integer) value).intValue(); + } + } + + static abstract class MapIntMap implements Entities.EntityMap { + protected Map mapNameToValue; + + protected Map mapValueToName; + + /** + * {@inheritDoc} + */ + public void add(String name, int value) { + mapNameToValue.put(name, new Integer(value)); + mapValueToName.put(new Integer(value), name); + } + + /** + * {@inheritDoc} + */ + public String name(int value) { + return (String) mapValueToName.get(new Integer(value)); + } + + /** + * {@inheritDoc} + */ + public int value(String name) { + Object value = mapNameToValue.get(name); + if (value == null) { + return -1; + } + return ((Integer) value).intValue(); + } + } + + static class HashEntityMap extends MapIntMap { + /** + * Constructs a new instance ofHashEntityMap.
+ */
+ public HashEntityMap() {
+ mapNameToValue = new HashMap();
+ mapValueToName = new HashMap();
+ }
+ }
+
+    /**
+     * Entity map whose name-to-value and value-to-name tables are both {@link TreeMap} instances.
+     */
+    static class TreeEntityMap extends MapIntMap {
+        /**
+         * Creates an empty map by installing a fresh TreeMap in each of the two inherited table fields.
+         */
+        public TreeEntityMap() {
+            this.mapValueToName = new TreeMap();
+            this.mapNameToValue = new TreeMap();
+        }
+    }
+
+ static class LookupEntityMap extends PrimitiveEntityMap {
+ private String[] lookupTable;
+
+ private int LOOKUP_TABLE_SIZE = 256;
+
+ /**
+ * {@inheritDoc}
+ */
+ public String name(int value) {
+ if (value < LOOKUP_TABLE_SIZE) {
+ return lookupTable()[value];
+ }
+ return super.name(value);
+ }
+
+ /**
+ * + * Returns the lookup table for this entity map. The lookup table is created if it has not been previously. + *
+ * + * @return the lookup table + */ + private String[] lookupTable() { + if (lookupTable == null) { + createLookupTable(); + } + return lookupTable; + } + + /** + *+ * Creates an entity lookup table of LOOKUP_TABLE_SIZE elements, initialized with entity names. + *
+ */ + private void createLookupTable() { + lookupTable = new String[LOOKUP_TABLE_SIZE]; + for (int i = 0; i < LOOKUP_TABLE_SIZE; ++i) { + lookupTable[i] = super.name(i); + } + } + } + + static class ArrayEntityMap implements EntityMap { + protected int growBy = 100; + + protected int size = 0; + + protected String[] names; + + protected int[] values; + + /** + * Constructs a new instance ofArrayEntityMap.
+ */
+ public ArrayEntityMap() {
+ names = new String[growBy];
+ values = new int[growBy];
+ }
+
+ /**
+ * Constructs a new instance of ArrayEntityMap specifying the size by which the array should
+ * grow.
+ *
+ * @param growBy
+ * array will be initialized to and will grow by this amount
+ */
+ public ArrayEntityMap(int growBy) {
+ this.growBy = growBy;
+ names = new String[growBy];
+ values = new int[growBy];
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public void add(String name, int value) {
+ ensureCapacity(size + 1);
+ names[size] = name;
+ values[size] = value;
+ size++;
+ }
+
+ /**
+ * Verifies the capacity of the entity array, adjusting the size if necessary.
+ *
+ * @param capacity
+ * size the array should be
+ */
+ protected void ensureCapacity(int capacity) {
+ if (capacity > names.length) {
+ int newSize = Math.max(capacity, size + growBy);
+ String[] newNames = new String[newSize];
+ System.arraycopy(names, 0, newNames, 0, size);
+ names = newNames;
+ int[] newValues = new int[newSize];
+ System.arraycopy(values, 0, newValues, 0, size);
+ values = newValues;
+ }
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public String name(int value) {
+ for (int i = 0; i < size; ++i) {
+ if (values[i] == value) {
+ return names[i];
+ }
+ }
+ return null;
+ }
+
+ /**
+ * {@inheritDoc}
+ */
+ public int value(String name) {
+ for (int i = 0; i < size; ++i) {
+ if (names[i].equals(name)) {
+ return values[i];
+ }
+ }
+ return -1;
+ }
+ }
+
+    /**
+     * Array-backed entity map that keeps its entries ordered by value so that name lookups can use a binary
+     * search instead of the linear scan inherited from ArrayEntityMap.
+     */
+    static class BinaryEntityMap extends ArrayEntityMap {
+
+        /**
+         * Constructs a new instance of BinaryEntityMap.
+         */
+        public BinaryEntityMap() {
+            super();
+        }
+
+        /**
+         * Constructs a new instance of BinaryEntityMap specifying the size by which the underlying array
+         * should grow.
+         *
+         * @param growBy
+         *            array will be initialized to and will grow by this amount
+         */
+        public BinaryEntityMap(int growBy) {
+            super(growBy);
+        }
+
+        /**
+         * Performs a binary search of the entity array for the specified key. This method is based on code in
+         * {@link java.util.Arrays}.
+         *
+         * @param key
+         *            the key to be found
+         * @return the index of the matching entry (zero or greater) when the key is present; otherwise
+         *         -(insertion point + 1), which is always negative
+         */
+        private int binarySearch(int key) {
+            int low = 0;
+            int high = size - 1;
+
+            while (low <= high) {
+                // Unsigned shift keeps the midpoint correct even if low + high overflows int.
+                int mid = (low + high) >>> 1;
+                int midVal = values[mid];
+
+                if (midVal < key) {
+                    low = mid + 1;
+                } else if (midVal > key) {
+                    high = mid - 1;
+                } else {
+                    return mid; // key found
+                }
+            }
+            return -(low + 1); // key not found.
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        public void add(String name, int value) {
+            ensureCapacity(size + 1);
+            int insertAt = binarySearch(value);
+            // A hit at index 0 is still a hit; testing "> 0" here would fall through and shift from index -1.
+            if (insertAt >= 0) {
+                return; // note: this means you can't insert the same value twice
+            }
+            insertAt = -(insertAt + 1); // binarySearch returns it negative and off-by-one
+            System.arraycopy(values, insertAt, values, insertAt + 1, size - insertAt);
+            values[insertAt] = value;
+            System.arraycopy(names, insertAt, names, insertAt + 1, size - insertAt);
+            names[insertAt] = name;
+            size++;
+        }
+
+        /**
+         * {@inheritDoc}
+         */
+        public String name(int value) {
+            int index = binarySearch(value);
+            if (index < 0) {
+                return null;
+            }
+            return names[index];
+        }
+    }
+
+ // package scoped for testing
+ EntityMap map = new Entities.LookupEntityMap();
+
+ /**
+ * + * Adds entities to this entity. + *
+ * + * @param entityArray + * array of entities to be added + */ + public void addEntities(String[][] entityArray) { + for (int i = 0; i < entityArray.length; ++i) { + addEntity(entityArray[i][0], Integer.parseInt(entityArray[i][1])); + } + } + + /** + *+ * Add an entity to this entity. + *
+ * + * @param name + * name of the entity + * @param value + * vale of the entity + */ + public void addEntity(String name, int value) { + map.add(name, value); + } + + /** + *+ * Returns the name of the entity identified by the specified value. + *
+ * + * @param value + * the value to locate + * @return entity name associated with the specified value + */ + public String entityName(int value) { + return map.name(value); + } + + /** + *+ * Returns the value of the entity identified by the specified name. + *
+ * + * @param name + * the name to locate + * @return entity value associated with the specified name + */ + public int entityValue(String name) { + return map.value(name); + } + + /** + *
+ * Escapes the characters in a String.
+ *
+ * For example, if you have called addEntity("foo", 0xA1), escape("\u00A1") will return + * "&foo;" + *
+ * + * @param str + * TheString to escape.
+ * @return A new escaped String.
+ */
+ public String escape(String str) {
+ StringWriter stringWriter = createStringWriter(str);
+ try {
+ this.escape(stringWriter, str);
+ } catch (IOException e) {
+ // This should never happen because ALL the StringWriter methods called by #escape(Writer, String) do not
+ // throw IOExceptions.
+ throw new RuntimeException(e);
+ }
+ return stringWriter.toString();
+ }
+
+    /**
+     * Escapes the characters in the String passed and writes the result to the Writer passed.
+     *
+     * A character with a registered entity name is written as "&name;". Any other character above 0x7F is
+     * written as a decimal numeric character reference ("&#NNN;"), and everything else is copied unchanged.
+     * The input is processed one char at a time, so the two halves of a surrogate pair are escaped separately.
+     *
+     * @param writer
+     *            The Writer to write the results of the escaping to. Assumed to be a non-null value.
+     * @param str
+     *            The String to escape. Assumed to be a non-null value.
+     * @throws IOException
+     *             when Writer passed throws the exception from calls to the {@link Writer#write(int)}
+     *             methods.
+     *
+     * @see #escape(String)
+     * @see Writer
+     */
+    public void escape(Writer writer, String str) throws IOException {
+        int len = str.length();
+        for (int i = 0; i < len; i++) {
+            char c = str.charAt(i);
+            String entityName = this.entityName(c);
+            if (entityName == null) {
+                if (c > 0x7F) {
+                    // The "&#" prefix is what makes this a numeric character reference that doUnescape can read.
+                    writer.write("&#");
+                    writer.write(Integer.toString(c, 10));
+                    writer.write(';');
+                } else {
+                    writer.write(c);
+                }
+            } else {
+                writer.write('&');
+                writer.write(entityName);
+                writer.write(';');
+            }
+        }
+    }
+
+    /**
+     * Unescapes the entities in a String.
+     *
+     * For example, after addEntity("foo", 0xA1), unescaping "&foo;" returns a string that consists of the
+     * character U+00A1.
+     *
+     * @param str
+     *            The String to unescape.
+     * @return The input itself when it contains no ampersand, otherwise a new unescaped String.
+     */
+    public String unescape(String str) {
+        int ampPos = str.indexOf('&');
+        if (ampPos < 0) {
+            return str;
+        }
+        StringWriter buffer = createStringWriter(str);
+        try {
+            doUnescape(buffer, str, ampPos);
+        } catch (IOException impossible) {
+            // This should never happen because ALL the StringWriter methods called by #escape(Writer, String)
+            // do not throw IOExceptions.
+            throw new RuntimeException(impossible);
+        }
+        return buffer.toString();
+    }
+
+    /**
+     * Creates a StringWriter whose initial capacity is the source length plus 10%, so that modest growth
+     * from escaping does not force the writer to grow.
+     *
+     * @param str The source string
+     * @return A newly created StringWriter
+     */
+    private StringWriter createStringWriter(String str) {
+        int sourceLength = str.length();
+        int capacity = (int) (sourceLength + (sourceLength * 0.1));
+        return new StringWriter(capacity);
+    }
+
+    /**
+     * Unescapes the escaped entities in the String passed and writes the result to the Writer passed.
+     *
+     * A string without any ampersand is written through unchanged in a single call.
+     *
+     * @param writer
+     *            The Writer to write the results to; assumed to be non-null.
+     * @param str
+     *            The source String to unescape; assumed to be non-null.
+     * @throws IOException
+     *             when Writer passed throws the exception from calls to the {@link Writer#write(int)}
+     *             methods.
+     *
+     * @see #escape(String)
+     * @see Writer
+     */
+    public void unescape(Writer writer, String str) throws IOException {
+        int ampPos = str.indexOf('&');
+        if (ampPos >= 0) {
+            doUnescape(writer, str, ampPos);
+        } else {
+            writer.write(str);
+        }
+    }
+
+    /**
+     * Underlying unescape method that allows the optimisation of not starting from the 0 index again.
+     *
+     * Each "&...;" run is resolved either as a numeric reference ("&#NNN;" decimal, "&#xHH;" hexadecimal) or
+     * as a registered entity name. A run that cannot be resolved, or a numeric value outside 0..0xFFFF, is
+     * written back exactly as it appeared in the source.
+     *
+     * @param writer
+     *            The Writer to write the results to; assumed to be non-null.
+     * @param str
+     *            The source String to unescape; assumed to be non-null.
+     * @param firstAmp
+     *            The int index of the first ampersand in the source String.
+     * @throws IOException
+     *             when Writer passed throws the exception from calls to the {@link Writer#write(int)}
+     *             methods.
+     */
+    private void doUnescape(Writer writer, String str, int firstAmp) throws IOException {
+        // Nothing before the first ampersand can be an entity, so copy that prefix through in one call.
+        writer.write(str, 0, firstAmp);
+        int len = str.length();
+        for (int i = firstAmp; i < len; i++) {
+            char c = str.charAt(i);
+            if (c == '&') {
+                int nextIdx = i + 1;
+                int semiColonIdx = str.indexOf(';', nextIdx);
+                if (semiColonIdx == -1) {
+                    // No terminating semi-colon ahead: treat the ampersand as literal text.
+                    writer.write(c);
+                    continue;
+                }
+                int ampersandIdx = str.indexOf('&', nextIdx);
+                if (ampersandIdx != -1 && ampersandIdx < semiColonIdx) {
+                    // Then the text looks like &...&...;
+                    writer.write(c);
+                    continue;
+                }
+                String entityContent = str.substring(nextIdx, semiColonIdx);
+                int entityValue = -1;
+                int entityContentLen = entityContent.length();
+                if (entityContentLen > 0) {
+                    if (entityContent.charAt(0) == '#') { // escaped value content is an integer (decimal or
+                        // hexadecimal)
+                        if (entityContentLen > 1) {
+                            char isHexChar = entityContent.charAt(1);
+                            try {
+                                switch (isHexChar) {
+                                    case 'X' :
+                                    case 'x' : {
+                                        entityValue = Integer.parseInt(entityContent.substring(2), 16);
+                                        break;
+                                    }
+                                    default : {
+                                        entityValue = Integer.parseInt(entityContent.substring(1), 10);
+                                    }
+                                }
+                                // parseInt accepts a leading minus sign; a negative result must not reach
+                                // writer.write(int), which would emit an unrelated character.
+                                if (entityValue < 0 || entityValue > 0xFFFF) {
+                                    entityValue = -1;
+                                }
+                            } catch (NumberFormatException e) {
+                                entityValue = -1;
+                            }
+                        }
+                    } else { // escaped value content is an entity name
+                        entityValue = this.entityValue(entityContent);
+                    }
+                }
+
+                if (entityValue == -1) {
+                    // Unresolved: reproduce the original text unchanged.
+                    writer.write('&');
+                    writer.write(entityContent);
+                    writer.write(';');
+                } else {
+                    writer.write(entityValue);
+                }
+                i = semiColonIdx; // move index up to the semi-colon
+            } else {
+                writer.write(c);
+            }
+        }
+    }
+
+}
Index: ext/hpricot_scan/hpricot_scan.java.rl
===================================================================
--- ext/hpricot_scan/hpricot_scan.java.rl (revision 161)
+++ ext/hpricot_scan/hpricot_scan.java.rl (working copy)
@@ -6,7 +6,9 @@
import org.jruby.RubyHash;
import org.jruby.RubyModule;
import org.jruby.RubyNumeric;
+import org.jruby.RubyObjectAdapter;
import org.jruby.RubyString;
+import org.jruby.javasupport.JavaEmbedUtils;
import org.jruby.runtime.Block;
import org.jruby.runtime.CallbackFactory;
import org.jruby.runtime.builtin.IRubyObject;
@@ -15,6 +17,7 @@
public class HpricotScanService implements BasicLibraryService {
public static String NO_WAY_SERIOUSLY="*** This should not happen, please send a bug report with the HTML you're parsing to why@whytheluckystiff.net. So sorry!";
+ private static RubyObjectAdapter rubyApi;
public void ELE(IRubyObject N) {
if (tokend > tokstart || text) {
@@ -239,8 +242,8 @@
}
buffer_size = BUFSIZE;
- if (recv.getInstanceVariable("@buffer_size") != null) {
- bufsize = recv.getInstanceVariable("@buffer_size");
+ if (rubyApi.getInstanceVariable(recv, "@buffer_size") != null) {
+ bufsize = rubyApi.getInstanceVariable(recv, "@buffer_size");
if (!bufsize.isNil()) {
buffer_size = RubyNumeric.fix2int(bufsize);
}
@@ -359,5 +362,6 @@
CallbackFactory fact = runtime.callbackFactory(HpricotScanService.class);
mHpricot.getMetaClass().defineMethod("scan",fact.getSingletonMethod("__hpricot_scan",IRubyObject.class));
mHpricot.defineClassUnder("ParseError",runtime.getClass("Exception"),runtime.getClass("Exception").getAllocator());
+ rubyApi = JavaEmbedUtils.newObjectAdapter();
}
}
Index: Rakefile
===================================================================
--- Rakefile (revision 161)
+++ Rakefile (working copy)
@@ -12,16 +12,16 @@
REV = `svn info`[/Revision: (\d+)/, 1] rescue nil
VERS = ENV['VERSION'] || "0.6" + (REV ? ".#{REV}" : "")
PKG = "#{NAME}-#{VERS}"
-BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp}"
+BIN = "*.{bundle,jar,so,obj,pdb,lib,def,exp,class}"
ARCHLIB = "lib/#{::Config::CONFIG['arch']}"
-CLEAN.include ["ext/hpricot_scan/#{BIN}", "ext/fast_xs/#{BIN}", "lib/**/#{BIN}",
+CLEAN.include ["ext/hpricot_scan/#{BIN}", "ext/fast_xs/#{BIN}", "lib/**/#{BIN}", ARCHLIB,
'ext/fast_xs/Makefile', 'ext/hpricot_scan/Makefile',
- '**/.*.sw?', '*.gem', '.config']
+ '**/.*.sw?', '*.gem', '.config', 'pkg']
RDOC_OPTS = ['--quiet', '--title', 'The Hpricot Reference', '--main', 'README', '--inline-source']
PKG_FILES = %w(CHANGELOG COPYING README Rakefile) +
Dir.glob("{bin,doc,test,lib,extras}/**/*") +
Dir.glob("ext/**/*.{h,java,c,rb,rl}") +
- %w[ext/hpricot_scan/hpricot_scan.c] # needed because it's generated later
+ %w[ext/hpricot_scan/hpricot_scan.c ext/hpricot_scan/HpricotScanService.java] # needed because it's generated later
SPEC =
Gem::Specification.new do |s|
s.name = NAME
@@ -164,19 +164,44 @@
### JRuby Packages ###
-compile_java = proc do
- sh %{javac -source 1.4 -target 1.4 -classpath $JRUBY_HOME/lib/jruby.jar HpricotScanService.java}
- sh %{jar cf hpricot_scan.jar HpricotScanService.class}
+def java_classpath_arg
+  # Returns the "-cp ..." option to splice into a java tool command line, or "" when no classpath is found.
+  classpath = begin
+    require 'java'
+    # Already running in a JRuby JVM: reuse the classpath reported by the JVM itself
+    Java::java.lang.System.getProperty('java.class.path')
+  rescue LoadError
+    ENV['JRUBY_PARENT_CLASSPATH'] || ENV['JRUBY_HOME'] && FileList["#{ENV['JRUBY_HOME']}/lib/*.jar"].join(File::PATH_SEPARATOR)
+  end
+  # An empty string is truthy in Ruby, so test emptiness explicitly; a bare "-cp " would make the
+  # next command-line word the classpath. Quote the value so entries containing spaces survive the shell.
+  classpath.to_s.empty? ? "" : %{-cp "#{classpath}"}
end
-desc "Compiles the JRuby extension"
+def compile_java(filename, jarname)
+  # Compile +filename+, then bundle every .class file in the current directory into +jarname+.
+  javac_cmd = "javac -source 1.4 -target 1.4 #{java_classpath_arg} #{filename}"
+  jar_cmd = "jar cf #{jarname} *.class"
+  sh javac_cmd
+  sh jar_cmd
+end
+
task :hpricot_scan_java => [:ragel_java] do
- Dir.chdir("ext/hpricot_scan", &compile_java)
+ Dir.chdir "ext/hpricot_scan" do
+ compile_java("HpricotScanService.java", "hpricot_scan.jar")
+ end
end
+task :fast_xs_java do
+  # Build from inside ext/fast_xs so the class files and the jar are produced in that directory.
+  Dir.chdir("ext/fast_xs") { compile_java "FastXsService.java", "fast_xs.jar" }
+end
+
+desc "Compiles the JRuby extensions"
+task :hpricot_java => [:hpricot_scan_java, :fast_xs_java] do
+  # Gather the jars built by the prerequisite tasks into the arch-specific lib directory.
+  mkdir_p ARCHLIB
+  %w(hpricot_scan fast_xs).each do |name|
+    mv "ext/#{name}/#{name}.jar", ARCHLIB
+  end
+end
+
JRubySpec = SPEC.dup
JRubySpec.platform = 'jruby'
-JRubySpec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.jar"]
+JRubySpec.files = PKG_FILES + ["#{ARCHLIB}/hpricot_scan.jar", "#{ARCHLIB}/fast_xs.jar"]
JRubySpec.extensions = []
JRUBY_PKG_DIR = "#{PKG}-jruby"
@@ -187,15 +212,10 @@
mv PKG, JRUBY_PKG_DIR
end
-desc "Cross-compile the hpricot_scan extension for JRuby"
-file "hpricot_scan_jruby" => [JRUBY_PKG_DIR] do
- Dir.chdir("#{JRUBY_PKG_DIR}/ext/hpricot_scan", &compile_java)
- mv "#{JRUBY_PKG_DIR}/ext/hpricot_scan/hpricot_scan.jar", "#{JRUBY_PKG_DIR}/#{ARCHLIB}"
-end
-
desc "Build the RubyGems package for JRuby"
-task :package_jruby => ["hpricot_scan_jruby"] do
+task :package_jruby => JRUBY_PKG_DIR do
Dir.chdir("#{JRUBY_PKG_DIR}") do
+ Rake::Task[:hpricot_java].invoke
Gem::Builder.new(JRubySpec).build
verbose(true) {
mv Dir["*.gem"].first, "../pkg/#{JRUBY_PKG_DIR}.gem"