/*
 * Purpose: parse a QQ input-method dictionary file (qpyd) and return the
 * words it stores as a list.
 */
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.zip.InflaterOutputStream;
import java.util.List;
import java.util.ArrayList;

/**
 * Reader for QQ dictionary ({@code .qpyd}) files.
 *
 * <p>Layout as consumed by this reader: a little-endian 32-bit integer at file
 * offset {@code 0x38} gives the offset at which the compressed dictionary
 * starts; the compressed data runs to the end of the file. Once inflated, the
 * dictionary begins with an index table of 10-byte entries, followed by the
 * pinyin/word data the entries point at.
 */
public class QQqpydReader {

    /** Dictionary parsed by {@link #main} when no path is given on the command line. */
    private static final String DEFAULT_QPYD_FILE =
            "G:/各大输入法词库/QQ/单线程下载/QQ/城市地区/安徽/城市信息(安庆).qpyd";

    /** File offset of the 32-bit field holding the start of the compressed dictionary. */
    private static final int ZIPPED_DICT_ADDR_OFFSET = 0x38;

    /** Size in bytes of one entry in the index table of the inflated dictionary. */
    private static final int INDEX_ENTRY_SIZE = 0xa;

    /** Offset inside an index entry of the one-byte pinyin length. */
    private static final int ENTRY_PINYIN_LENGTH = 0x0;

    /** Offset inside an index entry of the one-byte word length (in bytes). */
    private static final int ENTRY_WORD_LENGTH = 0x1;

    /** Offset inside an index entry of the 32-bit start address of the pinyin. */
    private static final int ENTRY_PINYIN_ADDR = 0x6;

    /**
     * Prints every word of a qpyd dictionary, one per line.
     *
     * @param args optional; {@code args[0]} is the path of the qpyd file. When
     *             absent, the built-in default path is used.
     * @throws Exception if the file cannot be read or parsed
     */
    public static void main(String[] args) throws Exception {
        String qpydFile = args.length > 0 ? args[0] : DEFAULT_QPYD_FILE;
        for (String word : readQpydFile(qpydFile)) {
            System.out.println(word);
        }
    }

    /**
     * Reads a QQ dictionary file (qpyd) and returns all words it contains.
     *
     * <p>Side effect: the inflated dictionary is also written next to the
     * input as {@code inputPath + ".unzipped"} (kept for debugging).
     *
     * @param inputPath path of the qpyd file
     * @return the words of the dictionary in index-table order; empty when the
     *         inflated dictionary is empty
     * @throws Exception if the file cannot be read, the debug copy cannot be
     *                   written, or the content is truncated or malformed
     *                   (reported as an {@link IOException})
     */
    public static List<String> readQpydFile(String inputPath) throws Exception {
        byte[] raw = readFile(inputPath);

        if (raw.length < ZIPPED_DICT_ADDR_OFFSET + 4) {
            throw new IOException("File too short to be a qpyd dictionary: " + inputPath);
        }
        int zippedDictStart = ByteBuffer.wrap(raw)
                .order(ByteOrder.LITTLE_ENDIAN)
                .getInt(ZIPPED_DICT_ADDR_OFFSET);
        if (zippedDictStart < 0 || zippedDictStart > raw.length) {
            throw new IOException("Compressed data offset " + zippedDictStart
                    + " lies outside the file: " + inputPath);
        }

        // The compressed dictionary occupies everything from its start offset to EOF.
        byte[] unzipped = inflate(raw, zippedDictStart, raw.length - zippedDictStart);

        // For debugging: save the unzipped data to a *.unzipped file.
        writeFile(inputPath + ".unzipped", unzipped);

        return parseWords(unzipped);
    }

    /** Reads the whole file at {@code path} into memory, always closing the file. */
    private static byte[] readFile(String path) throws IOException {
        RandomAccessFile file = new RandomAccessFile(path, "r");
        try {
            long size = file.length();
            if (size > Integer.MAX_VALUE) {
                throw new IOException("File too large to load into memory: " + path);
            }
            byte[] content = new byte[(int) size];
            file.readFully(content);
            return content;
        } finally {
            file.close();
        }
    }

    /** Inflates {@code length} bytes of {@code data} starting at {@code offset}. */
    private static byte[] inflate(byte[] data, int offset, int length) throws IOException {
        ByteArrayOutputStream inflated = new ByteArrayOutputStream();
        InflaterOutputStream inflater = new InflaterOutputStream(inflated);
        try {
            inflater.write(data, offset, length);
        } finally {
            // Closing flushes pending output and releases the native inflater.
            inflater.close();
        }
        return inflated.toByteArray();
    }

    /** Writes {@code content} to the file at {@code path}, always closing the stream. */
    private static void writeFile(String path, byte[] content) throws IOException {
        FileOutputStream out = new FileOutputStream(path);
        try {
            out.write(content);
        } finally {
            out.close();
        }
    }

    /**
     * Extracts the words from an inflated dictionary.
     *
     * <p>Each 10-byte index entry supplies the pinyin length (byte 0), the
     * word length in bytes (byte 1) and the little-endian start address of the
     * pinyin (bytes 6-9); the remaining bytes are not read here. The word is
     * stored directly after its pinyin as UTF-16LE. The pinyin address of the
     * first entry marks where the index table ends.
     */
    private static List<String> parseWords(byte[] dict) throws IOException {
        List<String> words = new ArrayList<String>();
        if (dict.length == 0) {
            return words;
        }

        ByteBuffer view = ByteBuffer.wrap(dict).order(ByteOrder.LITTLE_ENDIAN);
        int indexEnd = -1; // unknown until the first entry has been read
        for (int entry = 0; indexEnd < 0 || entry < indexEnd; entry += INDEX_ENTRY_SIZE) {
            if (entry > dict.length - INDEX_ENTRY_SIZE) {
                throw new IOException("Truncated index entry at offset " + entry);
            }
            int pinyinLength = view.get(entry + ENTRY_PINYIN_LENGTH) & 0xff;
            int wordLength = view.get(entry + ENTRY_WORD_LENGTH) & 0xff;
            int pinyinStart = view.getInt(entry + ENTRY_PINYIN_ADDR);

            // Rejecting out-of-range addresses also keeps a negative first
            // address from being mistaken for the "unknown" marker above.
            if (pinyinStart < 0 || pinyinStart > dict.length) {
                throw new IOException("Index entry at offset " + entry
                        + " points outside the dictionary: " + pinyinStart);
            }
            int wordStart = pinyinStart + pinyinLength;
            if (wordStart > dict.length - wordLength) {
                throw new IOException("Word of index entry at offset " + entry
                        + " runs past the end of the dictionary");
            }
            if (indexEnd < 0) {
                indexEnd = pinyinStart;
            }

            words.add(new String(dict, wordStart, wordLength, "UTF-16LE"));
        }
        return words;
    }
}