{"id":55825,"date":"2025-02-19T14:41:13","date_gmt":"2025-02-19T06:41:13","guid":{"rendered":"https:\/\/fwq.ai\/blog\/55825\/"},"modified":"2025-02-19T14:41:13","modified_gmt":"2025-02-19T06:41:13","slug":"50%e8%a1%8c%e6%ad%a3%e5%88%99%e8%a1%a8%e8%be%be%e5%bc%8f%e5%ae%9e%e7%8e%b0%e5%af%b9%e6%96%87%e6%a1%a3%e5%86%85%e5%a4%8d%e6%9d%82%e6%96%87%e6%9c%ac%e7%bb%93%e6%9e%84%e9%ab%98%e6%95%88%e5%88%86%e5%9d%97","status":"publish","type":"post","link":"https:\/\/fwq.ai\/blog\/55825\/","title":{"rendered":"50\u884c\u6b63\u5219\u8868\u8fbe\u5f0f\u5b9e\u73b0\u5bf9\u6587\u6863\u5185\u590d\u6742\u6587\u672c\u7ed3\u6784\u9ad8\u6548\u5206\u5757"},"content":{"rendered":"<p>Jina \u516c\u53f8\u7684 CEO \u8096\u6db5\u5728 GitHub \u4e0a\u5206\u4eab\u4e86\u4e00\u4e2a\u4ee4\u4eba\u5370\u8c61\u6df1\u523b\u7684\u4ee3\u7801\u7247\u6bb5\uff0c\u8fd9\u6bb5\u4ee3\u7801\u662f Jina tokenizer \u4e2d\u4f7f\u7528\u7684\u6838\u5fc3\u5206\u8bcd\u5b9e\u73b0\u3002\u8fd9\u4e2a\u6b63\u5219\u8868\u8fbe\u5f0f\u4ee3\u7801\u7247\u6bb5\u4ec5\u7528\u4e86 50 \u4f59\u884c\uff0c\u5374\u80fd\u591f\u9ad8\u6548\u5730\u5904\u7406\u5404\u79cd\u590d\u6742\u5ea6\u7684\u6587\u672c\u5185\u5bb9\u8fdb\u884c\u5206\u5757\u3002\u5176\u6027\u80fd\u4e4b\u5f3a\u52b2\u4ee4\u4eba\u60ca\u8bb6\u3002<\/p>\n<p>&nbsp;<\/p>\n<p> <br \/>\n<img decoding=\"async\" class=\"aligncenter\" src=\"\/\/www.w3.org\/2000\/svg'%20viewBox='0%200%20564%20361'%3E%3C\/svg%3E\" title=\"50\u884c\u6b63\u5219\u8868\u8fbe\u5f0f\u5b9e\u73b0\u5bf9\u6587\u6863\u5185\u590d\u6742\u6587\u672c\u7ed3\u6784\u9ad8\u6548\u5206\u5757\u63d2\u56fe\" alt=\"50\u884c\u6b63\u5219\u8868\u8fbe\u5f0f\u5b9e\u73b0\u5bf9\u6587\u6863\u5185\u590d\u6742\u6587\u672c\u7ed3\u6784\u9ad8\u6548\u5206\u5757\u63d2\u56fe\" \/><br \/>\n<img decoding=\"async\" class=\"aligncenter\" src=\"https:\/\/www.aisharenet.com\/wp-content\/uploads\/2024\/08\/0da447fbf7efa0c.png\" 
title=\"50\u884c\u6b63\u5219\u8868\u8fbe\u5f0f\u5b9e\u73b0\u5bf9\u6587\u6863\u5185\u590d\u6742\u6587\u672c\u7ed3\u6784\u9ad8\u6548\u5206\u5757\u63d2\u56fe1\" alt=\"50\u884c\u6b63\u5219\u8868\u8fbe\u5f0f\u5b9e\u73b0\u5bf9\u6587\u6863\u5185\u590d\u6742\u6587\u672c\u7ed3\u6784\u9ad8\u6548\u5206\u5757\u63d2\u56fe1\" \/> <\/p>\n<p>\u5728\u7ebf\u4f53\u9a8c\uff1ahttps:\/\/jina.ai\/tokenizer\/<\/p>\n<p>&nbsp;<\/p>\n<pre>\/\/ Updated: Aug. 15, 2024\r\n\/\/ Run: node testRegex.js testText.txt\r\n\/\/ Used in https:\/\/jina.ai\/tokenizer\r\nconst fs = require('fs');\r\nconst util = require('util');\r\n\n\/\/ Define variables for magic numbers\r\nconst MAX_HEADING_LENGTH = 7;\r\nconst MAX_HEADING_CONTENT_LENGTH = 200;\r\nconst MAX_HEADING_UNDERLINE_LENGTH = 200;\r\nconst MAX_HTML_HEADING_ATTRIBUTES_LENGTH = 100;\r\nconst MAX_LIST_ITEM_LENGTH = 200;\r\nconst MAX_NESTED_LIST_ITEMS = 6;\r\nconst MAX_LIST_INDENT_SPACES = 7;\r\nconst MAX_BLOCKQUOTE_LINE_LENGTH = 200;\r\nconst MAX_BLOCKQUOTE_LINES = 15;\r\nconst MAX_CODE_BLOCK_LENGTH = 1500;\r\nconst MAX_CODE_LANGUAGE_LENGTH = 20;\r\nconst MAX_INDENTED_CODE_LINES = 20;\r\nconst MAX_TABLE_CELL_LENGTH = 200;\r\nconst MAX_TABLE_ROWS = 20;\r\nconst MAX_HTML_TABLE_LENGTH = 2000;\r\nconst MIN_HORIZONTAL_RULE_LENGTH = 3;\r\nconst MAX_SENTENCE_LENGTH = 400;\r\nconst MAX_QUOTED_TEXT_LENGTH = 300;\r\nconst MAX_PARENTHETICAL_CONTENT_LENGTH = 200;\r\nconst MAX_NESTED_PARENTHESES = 5;\r\nconst MAX_MATH_INLINE_LENGTH = 100;\r\nconst MAX_MATH_BLOCK_LENGTH = 500;\r\nconst MAX_PARAGRAPH_LENGTH = 1000;\r\nconst MAX_STANDALONE_LINE_LENGTH = 800;\r\nconst MAX_HTML_TAG_ATTRIBUTES_LENGTH = 100;\r\nconst MAX_HTML_TAG_CONTENT_LENGTH = 1000;\r\nconst LOOKAHEAD_RANGE = 100;  \/\/ Number of characters to look ahead for a sentence boundary\r\n\n\/\/ Define the regex pattern\r\n\/\/ Headings\r\n\/\/ Citations\r\n\/\/ List items\r\n\/\/ Block quotes\r\n\/\/ Code blocks\r\n\/\/ Tables\r\n\/\/ Horizontal rules\r\n\/\/ Standalone lines or phrases\r\n\/\/ Sentences 
or phrases\r\n\/\/ Quoted text, parenthetical phrases, or bracketed content\r\n\/\/ Paragraphs\r\n\/\/ HTML-like tags and their content\r\n\/\/ LaTeX-style math expressions\r\n\/\/ Fallback for any remaining content\r\n\/\/ Read the regex and test text from files\r\n\nconst chunkRegex = new RegExp(\r\n    \"(\" +\r\n    \/\/ 1. Headings (Setext-style, Markdown, and HTML-style, with length constraints)\r\n    `(?:^(?:[#*=-]{1,${MAX_HEADING_LENGTH}}|\\\\w[^\\\\r\\\\n]{0,${MAX_HEADING_CONTENT_LENGTH}}\\\\r?\\\\n[-=]{2,${MAX_HEADING_UNDERLINE_LENGTH}}|&lt;h[1-6][^&gt;]{0,${MAX_HTML_HEADING_ATTRIBUTES_LENGTH}}&gt;)[^\\\\r\\\\n]{1,${MAX_HEADING_CONTENT_LENGTH}}(?:&lt;\/h[1-6]&gt;)?(?:\\\\r?\\\\n|$))` +\r\n    \"|\" +\r\n    \/\/ New pattern for citations\r\n    `(?:\\\\[[0-9]+\\\\][^\\\\r\\\\n]{1,${MAX_STANDALONE_LINE_LENGTH}})` +\r\n    \"|\" +\r\n    \/\/ 2. List items (bulleted, numbered, lettered, or task lists, including nested, up to three levels, with length constraints)\r\n    `(?:(?:^|\\\\r?\\\\n)[ \\\\t]{0,3}(?:[-*+\u2022]|\\\\d{1,3}\\\\.\\\\w\\\\.|\\\\[[ xX]\\\\])[ \\\\t]+(?:(?:\\\\b[^\\\\r\\\\n]{1,${MAX_LIST_ITEM_LENGTH}}\\\\b(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))|(?:\\\\b[^\\\\r\\\\n]{1,${MAX_LIST_ITEM_LENGTH}}\\\\b(?=[\\\\r\\\\n]|$))|(?:\\\\b[^\\\\r\\\\n]{1,${MAX_LIST_ITEM_LENGTH}}\\\\b(?=[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?:.{1,${LOOKAHEAD_RANGE}}(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))?))` +\r\n    `(?:(?:\\\\r?\\\\n[ \\\\t]{2,5}(?:[-*+\u2022]|\\\\d{1,3}\\\\.\\\\w\\\\.|\\\\[[ xX]\\\\])[ 
\\\\t]+(?:(?:\\\\b[^\\\\r\\\\n]{1,${MAX_LIST_ITEM_LENGTH}}\\\\b(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))|(?:\\\\b[^\\\\r\\\\n]{1,${MAX_LIST_ITEM_LENGTH}}\\\\b(?=[\\\\r\\\\n]|$))|(?:\\\\b[^\\\\r\\\\n]{1,${MAX_LIST_ITEM_LENGTH}}\\\\b(?=[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?:.{1,${LOOKAHEAD_RANGE}}(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))?)))` +\r\n    `{0,${MAX_NESTED_LIST_ITEMS}}(?:\\\\r?\\\\n[ \\\\t]{4,${MAX_LIST_INDENT_SPACES}}(?:[-*+\u2022]|\\\\d{1,3}\\\\.\\\\w\\\\.|\\\\[[ xX]\\\\])[ \\\\t]+(?:(?:\\\\b[^\\\\r\\\\n]{1,${MAX_LIST_ITEM_LENGTH}}\\\\b(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))|(?:\\\\b[^\\\\r\\\\n]{1,${MAX_LIST_ITEM_LENGTH}}\\\\b(?=[\\\\r\\\\n]|$))|(?:\\\\b[^\\\\r\\\\n]{1,${MAX_LIST_ITEM_LENGTH}}\\\\b(?=[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?:.{1,${LOOKAHEAD_RANGE}}(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))?)))` +\r\n    `{0,${MAX_NESTED_LIST_ITEMS}})?)` +\r\n    \"|\" +\r\n    \/\/ 3. 
Block quotes (including nested quotes and citations, up to three levels, with length constraints)\r\n    `(?:(?:^&gt;(?:&gt;|\\\\s{2,}){0,2}(?:(?:\\\\b[^\\\\r\\\\n]{0,${MAX_BLOCKQUOTE_LINE_LENGTH}}\\\\b(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))|(?:\\\\b[^\\\\r\\\\n]{0,${MAX_BLOCKQUOTE_LINE_LENGTH}}\\\\b(?=[\\\\r\\\\n]|$))|(?:\\\\b[^\\\\r\\\\n]{0,${MAX_BLOCKQUOTE_LINE_LENGTH}}\\\\b(?=[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?:.{1,${LOOKAHEAD_RANGE}}(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))?))\\\\r?\\\\n?){1,${MAX_BLOCKQUOTE_LINES}})` +\r\n    \"|\" +\r\n    \/\/ 4. Code blocks (fenced, indented, or HTML pre\/code tags, with length constraints)\r\n    `(?:(?:^|\\\\r?\\\\n)(?:\\`\\`\\`|~~~)(?:\\\\w{0,${MAX_CODE_LANGUAGE_LENGTH}})?\\\\r?\\\\n[\\\\s\\\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:\\`\\`\\`|~~~)\\\\r?\\\\n?` +\r\n    `|(?:(?:^|\\\\r?\\\\n)(?: {4}|\\\\t)[^\\\\r\\\\n]{0,${MAX_LIST_ITEM_LENGTH}}(?:\\\\r?\\\\n(?: {4}|\\\\t)[^\\\\r\\\\n]{0,${MAX_LIST_ITEM_LENGTH}}){0,${MAX_INDENTED_CODE_LINES}}\\\\r?\\\\n?)` +\r\n    `|(?:&lt;pre&gt;(?:&lt;code&gt;)?[\\\\s\\\\S]{0,${MAX_CODE_BLOCK_LENGTH}}?(?:&lt;\/code&gt;)?&lt;\/pre&gt;))` +\r\n    \"|\" +\r\n    \/\/ 5. Tables (Markdown, grid tables, and HTML tables, with length constraints)\r\n    `(?:(?:^|\\\\r?\\\\n)(?:\\\\|[^\\\\r\\\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\\\|(?:\\\\r?\\\\n\\\\|[-:]{1,${MAX_TABLE_CELL_LENGTH}}\\\\|){0,1}(?:\\\\r?\\\\n\\\\|[^\\\\r\\\\n]{0,${MAX_TABLE_CELL_LENGTH}}\\\\|){0,${MAX_TABLE_ROWS}}` +\r\n    `|&lt;table&gt;[\\\\s\\\\S]{0,${MAX_HTML_TABLE_LENGTH}}?&lt;\/table&gt;))` +\r\n    \"|\" +\r\n    \/\/ 6. Horizontal rules (Markdown and HTML hr tag)\r\n    `(?:^(?:[-*_]){${MIN_HORIZONTAL_RULE_LENGTH},}\\\\s*$|&lt;hr\\\\s*\/?&gt;)` +\r\n    \"|\" +\r\n    \/\/ 10. 
Standalone lines or phrases (including single-line blocks and HTML elements, with length constraints)\r\n    `(?:^(?:&lt;[a-zA-Z][^&gt;]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}&gt;)?(?:(?:[^\\\\r\\\\n]{1,${MAX_STANDALONE_LINE_LENGTH}}(?:[.!?\u2026]|\\\\.\\\\.\\\\.|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))|(?:[^\\\\r\\\\n]{1,${MAX_STANDALONE_LINE_LENGTH}}(?=[\\\\r\\\\n]|$))|(?:[^\\\\r\\\\n]{1,${MAX_STANDALONE_LINE_LENGTH}}(?=[.!?\u2026]|\\\\.\\\\.\\\\.|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?:.{1,${LOOKAHEAD_RANGE}}(?:[.!?\u2026]|\\\\.\\\\.\\\\.|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))?))(?:&lt;\/[a-zA-Z]+&gt;)?(?:\\\\r?\\\\n|$))` +\r\n    \"|\" +\r\n    \/\/ 7. Sentences or phrases ending with punctuation (including ellipsis and Unicode punctuation)\r\n    `(?:(?:[^\\\\r\\\\n]{1,${MAX_SENTENCE_LENGTH}}(?:[.!?\u2026]|\\\\.\\\\.\\\\.|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))|(?:[^\\\\r\\\\n]{1,${MAX_SENTENCE_LENGTH}}(?=[\\\\r\\\\n]|$))|(?:[^\\\\r\\\\n]{1,${MAX_SENTENCE_LENGTH}}(?=[.!?\u2026]|\\\\.\\\\.\\\\.|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?:.{1,${LOOKAHEAD_RANGE}}(?:[.!?\u2026]|\\\\.\\\\.\\\\.|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))?))` +\r\n    \"|\" +\r\n    \/\/ 8. 
Quoted text, parenthetical phrases, or bracketed content (with length constraints)\r\n    \"(?:\" +\r\n    `(?&lt;!\\\\w)\\\"\\\"\\\"[^\\\"]{0,${MAX_QUOTED_TEXT_LENGTH}}\\\"\\\"\\\"(?!\\\\w)` +\r\n    `|(?&lt;!\\\\w)(?:['\\\"\\`'\"])[^\\\\r\\\\n]{0,${MAX_QUOTED_TEXT_LENGTH}}\\\\1(?!\\\\w)` +\r\n    `|\\\\([^\\\\r\\\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}(?:\\\\([^\\\\r\\\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}\\\\)[^\\\\r\\\\n()]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}){0,${MAX_NESTED_PARENTHESES}}\\\\)` +\r\n    `|\\\\[[^\\\\r\\\\n\\\\[\\\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}(?:\\\\[[^\\\\r\\\\n\\\\[\\\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}\\\\][^\\\\r\\\\n\\\\[\\\\]]{0,${MAX_PARENTHETICAL_CONTENT_LENGTH}}){0,${MAX_NESTED_PARENTHESES}}\\\\]` +\r\n    `|\\\\$[^\\\\r\\\\n$]{0,${MAX_MATH_INLINE_LENGTH}}\\\\$` +\r\n    `|\\`[^\\`\\\\r\\\\n]{0,${MAX_MATH_INLINE_LENGTH}}\\`` +\r\n    \")\" +\r\n    \"|\" +\r\n    \/\/ 9. Paragraphs (with length constraints)\r\n    `(?:(?:^|\\\\r?\\\\n\\\\r?\\\\n)(?:&lt;p&gt;)?(?:(?:[^\\\\r\\\\n]{1,${MAX_PARAGRAPH_LENGTH}}(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))|(?:[^\\\\r\\\\n]{1,${MAX_PARAGRAPH_LENGTH}}(?=[\\\\r\\\\n]|$))|(?:[^\\\\r\\\\n]{1,${MAX_PARAGRAPH_LENGTH}}(?=[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?:.{1,${LOOKAHEAD_RANGE}}(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))?))(?:&lt;\/p&gt;)?(?=\\\\r?\\\\n\\\\r?\\\\n|$))` +\r\n    \"|\" +\r\n    \/\/ 11. HTML-like tags and their content (including self-closing tags and attributes, with length constraints)\r\n    `(?:&lt;[a-zA-Z][^&gt;]{0,${MAX_HTML_TAG_ATTRIBUTES_LENGTH}}(?:&gt;[\\\\s\\\\S]{0,${MAX_HTML_TAG_CONTENT_LENGTH}}?&lt;\/[a-zA-Z]+&gt;|\\\\s*\/&gt;))` +\r\n    \"|\" +\r\n    \/\/ 12. 
LaTeX-style math expressions (inline and block, with length constraints)\r\n    `(?:(?:\\\\$\\\\$[\\\\s\\\\S]{0,${MAX_MATH_BLOCK_LENGTH}}?\\\\$\\\\$)|(?:\\\\$[^\\\\$\\\\r\\\\n]{0,${MAX_MATH_INLINE_LENGTH}}\\\\$))` +\r\n    \"|\" +\r\n    \/\/ 14. Fallback for any remaining content (with length constraints)\r\n    `(?:(?:[^\\\\r\\\\n]{1,${MAX_STANDALONE_LINE_LENGTH}}(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))|(?:[^\\\\r\\\\n]{1,${MAX_STANDALONE_LINE_LENGTH}}(?=[\\\\r\\\\n]|$))|(?:[^\\\\r\\\\n]{1,${MAX_STANDALONE_LINE_LENGTH}}(?=[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?:.{1,${LOOKAHEAD_RANGE}}(?:[.!?\u2026]|\\\\.{3}|[\\\\u2026\\\\u2047-\\\\u2049]|[\\\\p{Emoji_Presentation}\\\\p{Extended_Pictographic}])(?=\\\\s|$))?))` +\r\n    \")\",\r\n    \"gmu\"\r\n);\r\n\n\/\/ read from the arg[1] file\r\nconst testText = fs.readFileSync(process.argv[2], 'utf8');\r\n\n\/\/ Function to format bytes to a human-readable string\r\nfunction formatBytes(bytes) {\r\n    if (bytes &lt; 1024) return bytes + \" bytes\";\r\n    else if (bytes &lt; 1048576) return (bytes \/ 1024).toFixed(2) + \" KB\";\r\n    else if (bytes &lt; 1073741824) return (bytes \/ 1048576).toFixed(2) + \" MB\";\r\n    else return (bytes \/ 1073741824).toFixed(2) + \" GB\";\r\n}\r\n\n\/\/ Start measuring time and memory\r\nconst startTime = process.hrtime();\r\nconst startMemory = process.memoryUsage().heapUsed;\r\n\n\/\/ Apply the regex\r\nconst matches = testText.match(chunkRegex);\r\n\n\/\/ End measuring time and memory\r\nconst endTime = process.hrtime(startTime);\r\nconst endMemory = process.memoryUsage().heapUsed;\r\n\n\/\/ Calculate execution time and memory usage\r\nconst executionTime = endTime[0] + endTime[1] \/ 1e9;\r\nconst memoryUsed = endMemory - startMemory;\r\n\n\/\/ Output results\r\nconsole.log(`Number of chunks: ${matches ? 
matches.length : 0}`);\r\nconsole.log(`Execution time: ${executionTime.toFixed(3)} seconds`);\r\nconsole.log(`Memory used: ${formatBytes(memoryUsed)}`);\r\n\n\/\/ Output the first 5 matches (or fewer if there are less than 5)\r\nconsole.log('\\nFirst 10 chunks:');\r\nif (matches) {\r\n    matches.slice(0, 100).forEach((match, index) =&gt; {\r\n        console.log(util.inspect(match, {maxStringLength: 50}));\r\n    });\r\n} else {\r\n    console.log('No chunks found.');\r\n}\r\n\n\/\/ Output regex flags\r\nconsole.log(`\\nRegex flags: ${chunkRegex.flags}`);\r\n\n\/\/ Check for potential issues\r\nif (executionTime &gt; 5) {\r\n    console.warn('\\nWarning: Execution time exceeded 5 seconds. The regex might be too complex or the input too large.');\r\n}\r\nif (memoryUsed &gt; 100 * 1024 * 1024) {\r\n    console.warn('\\nWarning: Memory usage exceeded 100 MB. Consider processing the input in smaller chunks.');\r\n}<\/pre>\n<p>&nbsp;<\/p>\n<p>\u8fd9\u6bb5\u4ee3\u7801\u7684\u6b63\u5219\u8868\u8fbe\u5f0f\u8003\u8651\u4e86\u591a\u79cd\u6587\u672c\u7ed3\u6784\uff0c\u5305\u62ec\u6807\u9898\u3001\u5217\u8868\u9879\u3001\u533a\u5757\u5f15\u7528\u3001\u4ee3\u7801\u5757\u3001\u8868\u683c\u3001\u6c34\u5e73\u5206\u9694\u7ebf\u3001\u72ec\u7acb\u884c\u6216\u77ed\u8bed\u3001\u53e5\u5b50\u6216\u5e26\u6709\u6807\u70b9\u7684\u77ed\u8bed\u3001\u5f15\u7528\u6587\u672c\u3001\u5706\u62ec\u53f7\u5185\u5bb9\u3001\u65b9\u62ec\u53f7\u5185\u5bb9\u3001\u6bb5\u843d\u3001HTML \u6807\u7b7e\u5185\u5bb9\u3001LaTeX 
\u6570\u5b66\u8868\u8fbe\u5f0f\u7b49\u3002\u5b83\u901a\u8fc7\u7cbe\u5fc3\u8bbe\u8ba1\u7684\u6a21\u5f0f\u6765\u8fd1\u4f3c\u6587\u672c\u5206\u5757\uff0c\u5c3d\u7ba1\u6b63\u5219\u8868\u8fbe\u5f0f\u672c\u8eab\u5e76\u4e0d\u7406\u89e3\u6587\u672c\u7684\u4e0a\u4e0b\u6587\u6216\u8bed\u4e49\u3002<\/p>\n<p>\u4ee3\u7801\u793a\u4f8b\u4e2d\u7684\u6b63\u5219\u8868\u8fbe\u5f0f\u4f7f\u7528\u4e86\u201c\u56de\u6eaf\u201d\u529f\u80fd\uff0c\u8fd9\u5bf9\u4e8e\u5b9e\u73b0\u66f4\u6709\u610f\u4e49\u7684\u8bed\u4e49\u5206\u5272\u81f3\u5173\u91cd\u8981\u3002\u4f8b\u5982\uff0c\u5b83\u4e0d\u4f1a\u5728\u53e5\u5b50\u4e2d\u95f4\u8fdb\u884c\u65ad\u5f00\u3002\u7136\u800c\uff0c\u5bf9\u4e8e\u6df1\u5ea6\u5d4c\u5957\u7684\u5217\u8868\u3001\u533a\u5757\u5f15\u7528\u6216\u62ec\u53f7\u7b49\u7ed3\u6784\uff0c\u56de\u6eaf\u53ef\u80fd\u4f1a\u9047\u5230\u56f0\u96be\u3002\u4e3a\u4e86\u4f18\u5316\u8fd9\u4e9b\u60c5\u51b5\uff0c\u53ef\u4ee5\u8fdb\u4e00\u6b65\u6539\u8fdb\u6b63\u5219\u8868\u8fbe\u5f0f\uff0c\u4ee5\u66f4\u597d\u5730\u5904\u7406\u591a\u7ea7\u5d4c\u5957\uff0c\u5e76\u5c06\u5d4c\u5957\u9650\u5236\u5728\u5b9e\u7528\u6c34\u5e73\uff0c\u6bd4\u5982\u6700\u591a 3 \u7ea7\uff0c\u4ee5\u786e\u4fdd\u6027\u80fd\u5e76\u907f\u514d\u707e\u96be\u6027\u56de\u6eaf\u3002<\/p>\n<p>\u5c3d\u7ba1\u76ee\u524d\u8fd9\u6bb5\u4ee3\u7801\u53ef\u80fd\u8fd8\u4e0d\u662f\u5341\u5206\u5b8c\u5907\uff0c\u4f46\u6309\u7167\u8fd9\u4e2a\u601d\u8def\u4e0d\u65ad\u4f18\u5316\u7ec6\u8282\uff0c\u53ef\u4ee5\u9884\u89c1\u6548\u679c\u8fd8\u6709\u8fdb\u4e00\u6b65\u63d0\u5347\u7684\u7a7a\u95f4\u3002Jina \u5b98\u65b9\u63d0\u4f9b\u4e86\u4e91\u670d\u52a1\u5206\u8bcd\u5668\u63a5\u53e3\u4f9b\u5f00\u53d1\u8005\u4f53\u9a8c\u4f7f\u7528\uff0c\u5e76\u4e14\u662f\u514d\u8d39\u7684\u3002<\/p>\n<p>&nbsp;<\/p>\n<p><strong>python\u7248\u672c<\/strong> <\/p>\n<p><img decoding=\"async\" class=\"aligncenter\" align=\"right\" src=\"\/\/www.w3.org\/2000\/svg'%20viewBox='0%200%20150%20150'%3E%3C\/svg%3E\" style=\"width:150px;height:150px;margin-left:20px;border:none\" 
title=\"50\u884c\u6b63\u5219\u8868\u8fbe\u5f0f\u5b9e\u73b0\u5bf9\u6587\u6863\u5185\u590d\u6742\u6587\u672c\u7ed3\u6784\u9ad8\u6548\u5206\u5757\u63d2\u56fe2\" alt=\"50\u884c\u6b63\u5219\u8868\u8fbe\u5f0f\u5b9e\u73b0\u5bf9\u6587\u6863\u5185\u590d\u6742\u6587\u672c\u7ed3\u6784\u9ad8\u6548\u5206\u5757\u63d2\u56fe2\" \/><br \/>\n<img decoding=\"async\" class=\"aligncenter\" align=\"right\" src=\"https:\/\/www.aisharenet.com\/wp-content\/uploads\/2024\/07\/99dd797026b75ad.jpg\" style=\"width:150px;height:150px;margin-left:20px;border:none\" title=\"50\u884c\u6b63\u5219\u8868\u8fbe\u5f0f\u5b9e\u73b0\u5bf9\u6587\u6863\u5185\u590d\u6742\u6587\u672c\u7ed3\u6784\u9ad8\u6548\u5206\u5757\u63d2\u56fe3\" alt=\"50\u884c\u6b63\u5219\u8868\u8fbe\u5f0f\u5b9e\u73b0\u5bf9\u6587\u6863\u5185\u590d\u6742\u6587\u672c\u7ed3\u6784\u9ad8\u6548\u5206\u5757\u63d2\u56fe3\" \/><br \/>\n<span style=\"font-size:18px\">\u6b64\u5904\u5185\u5bb9\u5df2\u7ecf\u88ab\u4f5c\u8005\u9690\u85cf\uff0c\u8bf7\u8f93\u5165\u9a8c\u8bc1\u7801\u67e5\u770b\u5185\u5bb9<\/span><br \/>\n<span style=\"font-size:18px;float:left\">\u9a8c\u8bc1\u7801\uff1a<\/span><br \/>\n<span style=\"color:#00BF30\">\u8bf7\u5173\u6ce8\u672c\u7ad9\u5fae\u4fe1\u516c\u4f17\u53f7\uff0c\u56de\u590d\u201c<span style=\"color:blue\">\u9a8c\u8bc1\u7801<\/span>\u201d\uff0c\u83b7\u53d6\u9a8c\u8bc1\u7801\u3002\u5728\u5fae\u4fe1\u91cc\u641c\u7d22\u201c<span style=\"color:blue\">\u9996\u5e2dAI\u5206\u4eab\u5708<\/span>\u201d\u6216\u8005\u201c<span style=\"color:blue\">Looks-AI<\/span>\u201d\u6216\u8005\u5fae\u4fe1\u626b\u63cf\u53f3\u4fa7\u4e8c\u7ef4\u7801\u90fd\u53ef\u4ee5\u5173\u6ce8\u672c\u7ad9\u5fae\u4fe1\u516c\u4f17\u53f7\u3002<\/span> <\/p>\n","protected":false},"excerpt":{"rendered":"<p>Jina \u516c\u53f8\u7684 CEO \u8096\u6db5\u5728 GitHub \u4e0a\u5206\u4eab\u4e86\u4e00\u4e2a\u4ee4\u4eba\u5370\u8c61\u6df1\u523b\u7684\u4ee3\u7801\u7247\u6bb5\uff0c\u8fd9\u6bb5\u4ee3\u7801\u662f Jina tokenizer 
\u4e2d\u4f7f\u7528\u7684\u6838\u5fc3\u5206\u8bcd\u5b9e\u73b0\u3002\u8fd9\u4e2a\u6b63\u5219\u8868\u8fbe\u5f0f\u4ee3\u7801\u7247\u6bb5\u4ec5\u7528\u4e86 50 \u4f59\u884c\uff0c\u5374\u80fd\u591f\u9ad8\u6548\u5730\u5904\u7406\u5404\u79cd\u590d\u6742\u5ea6\u7684\u6587\u672c\u5185\u5bb9\u8fdb\u884c\u5206\u5757\u3002\u5176\u6027\u80fd\u4e4b\u5f3a\u52b2\u4ee4\u4eba\u60ca\u8bb6\u3002 &nbsp; \u5728\u7ebf\u4f53\u9a8c\uff1ahttps:\/\/jina.ai\/tokenizer\/ &nbsp; \/\/ Updated: Aug. 15, 2024 \/\/ Run: node testRegex.js testText.txt \/\/ Used in https:\/\/jina.ai\/tokenizer const fs = require(&#8216;fs&#8217;); const util = require(&#8216;util&#8217;); \/\/ Define variables for magic numbers const MAX_HEADING_LENGTH = 7; const MAX_HEADING_CONTENT_LENGTH = 200; const MAX_HEADING_UNDERLINE_LENGTH = 200; const [&hellip;]<\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"closed","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[13],"tags":[],"class_list":["post-55825","post","type-post","status-publish","format-standard","hentry","category-ai"],"_links":{"self":[{"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/posts\/55825","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/comments?post=55825"}],"version-history":[{"count":0,"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/posts\/55825\/revisions"}],"wp:attachment":[{"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/media?parent=55825"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/categories?post=55825"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/fwq.ai\/blog\/wp-json\/wp\/v2\/tags?post=55825"}
],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}