使用Node.js http 模块采集数据时的日文 EUC-JP 乱码，使用 iconv-lite 转码

项目需要采集日文网站，该日文网站编码是 EUC-JP。

原采集函数是：

/**
 * Download the HTML of the given page.
 *
 * Uses the `https` client when the first five characters of `url` are "https",
 * otherwise the `http` client. No stream encoding is set here: every received
 * chunk is coerced to a string on its own and the pieces are joined together.
 * NOTE(review): per the surrounding article this is what garbles EUC-JP pages.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<string>} Resolves with the concatenated response body;
 *   rejects with `false` when the request emits an error.
 */
function getHtml(url) {
  const client = url.substr(0, 5) === "https" ? https : http;

  return new Promise((resolve, reject) => {
    const request = client.get(url, (res) => {
      const parts = [];
      res.on('data', (chunk) => {
        // Same per-chunk string coercion as appending the chunk to a string.
        parts.push(String(chunk));
      });
      res.on('end', () => {
        resolve(parts.join(''));
      });
    });

    request.on('error', () => {
      console.log("下载指定页面 HTML 失败：", url);
      reject(false);
    });
  });
}

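该函数把收到的字节直接按 UTF-8 转成字符串再拼接，所以对 EUC-JP 页面返回的是乱码。解决办法是不要让 http 模块把数据解码成 UTF-8 字符串，而是保留原始字节（下面用 res.setEncoding('binary') 得到与字节一一对应的字符串，也可以直接收集 Buffer），然后再用 iconv-lite 按 EUC-JP 解码。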

/**
 * Download the raw body of the given page without decoding it as UTF-8.
 *
 * Uses the `https` client when `url` starts with "https", otherwise `http`.
 * The response is read with the 'binary' encoding, so the result is a string
 * in which every character code equals one raw byte of the body. Convert it
 * with `Buffer.from(result, 'binary')` before handing it to a charset decoder.
 *
 * @param {string} url - Address of the page to download.
 * @returns {Promise<string>} Resolves with the body as a 'binary' string;
 *   rejects with `false` when the request fails or the response is cut off
 *   before it completes.
 */
function getHtml(url) {
  const hp = url.startsWith("https") ? https : http;

  return new Promise(function (resolve, reject) {
    let settled = false;

    // Single failure path so the message is logged at most once.
    function fail() {
      if (settled) {
        return;
      }
      settled = true;
      console.log("下载指定页面 HTML 失败：", url);
      reject(false);
    }

    hp.get(url, function (res) {
      let html = '';
      res.setEncoding('binary');
      res.on('data', function (data) {
        html += data;
      });
      res.on('end', function () {
        settled = true;
        resolve(html);
      });
      // Without these two listeners a connection dropped mid-body would
      // leave the promise pending forever.
      res.on('error', fail);
      res.on('close', function () {
        if (!res.complete) {
          fail();
        }
      });
    }).on('error', fail);
  });
}

// getHtml() resolves with a 'binary' string (one character per raw byte).
// Turn it back into a Buffer before decoding: passing a string to
// iconv.decode() is deprecated in iconv-lite and triggers a warning.
const rawHtml = await getHtml(url);
var pageHtml = iconv.decode(Buffer.from(rawHtml, "binary"), "EUC-JP");

编码转换工具

使用的是 https://github.com/ashtuchkin/iconv-lite

基本使用方法：

const iconv = require("iconv-lite");

// Convert from an encoded buffer to a js string.
// Declared with const: a bare `str = ...` creates an implicit global and
// throws a ReferenceError in strict mode / ES modules.
const str = iconv.decode(Buffer.from([0x68, 0x65, 0x6c, 0x6c, 0x6f]), "win1251");

// Convert from a js string to an encoded buffer.
const buf = iconv.encode("Sample input string", "win1251");

// Check if encoding is supported
iconv.encodingExists("us-ascii");

支持的编码：

All node.js native encodings: utf8, ucs2 / utf16-le, ascii, binary, base64, hex.

Additional unicode encodings: utf16, utf16-be, utf-7, utf-7-imap, utf32, utf32-le, and utf32-be.

All widespread singlebyte encodings: Windows 125x family, ISO-8859 family, IBM/DOS codepages, Macintosh family, KOI8 family, all others supported by iconv library. Aliases like 'latin1', 'us-ascii' also supported.

All widespread multibyte encodings: CP932, CP936, CP949, CP950, GB2312, GBK, GB18030, Big5, Shift_JIS, EUC-JP.

Node.js 可读流（Readable）设置编码

readable.setEncoding() 方法为从可读流读取的数据设置字符编码。

默认情况下没有设置字符编码，流数据返回的是 Buffer 对象。如果设置了字符编码，则流数据返回指定编码的字符串。例如，调用 readable.setEncoding('utf-8') 会将数据解析为 UTF-8 数据，并返回字符串，调用 readable.setEncoding('hex') 则会将数据编码成十六进制字符串。

设置编码后，可读流会正确处理被拆分到多个数据块中的多字节字符；如果只是把数据作为 Buffer 对象逐块取出再各自解码，这些字符就可能被错误地解码。

爬虫

修改时间 2024-05-25

声明：本站所有文章和图片，如无特殊说明，均为原创发布，转载请注明出处。