在数据处理场景中,我们常需筛选Excel中的图片URL。本文分享一个完整的Java方案,涵盖从读取图片URL到检测有效性、筛选大小,再到生成新Excel文件的全过程,同时讲解开发与优化过程,帮助你解决实际业务中的数据筛选和清洗需求。
客户现场图片数据,要求如下:
为实现上述目标,我们主要采用以下技术:
package cn.api.server;
import org.apache.poi.ss.usermodel.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.logging.Level;
import java.util.logging.Logger;
public class ImageSizeFilter {
private static final Logger LOGGER = Logger.getLogger(ImageSizeFilter.class.getName());
private static final int CONNECT_TIMEOUT = 5000;
private static final int READ_TIMEOUT = 5000;
private static final double BYTES_TO_MEGABYTES = 1024.0 * 1024.0;
private static final double SIZE_THRESHOLD = 1.0;
private static final SimpleDateFormat DATE_FORMAT = new SimpleDateFormat("yyyy-MM-dd");
public static void main(String[] args) {
String inputFilePath = "C:/Users/admin/Desktop/图片数据.xlsx";
String outputFilePath = "C:/Users/admin/Desktop/图片数据_筛选后.xlsx";
System.out.println("开始处理Excel文件...");
System.out.println("输入文件: " + inputFilePath);
long startTime = System.currentTimeMillis();
int processedCount = 0;
int filteredCount = 0;
try (FileInputStream inputStream = new FileInputStream(new File(inputFilePath));
Workbook workbook = new XSSFWorkbook(inputStream)) {
Sheet sheet = workbook.getSheetAt(0);
int totalRows = sheet.getLastRowNum();
System.out.println("发现 " + totalRows + " 条数据记录");
try (Workbook newWorkbook = new XSSFWorkbook()) {
Sheet newSheet = newWorkbook.createSheet();
Row headerRow = sheet.getRow(0);
Row newHeaderRow = newSheet.createRow(0);
// 复制表头并添加新列
copyRow(headerRow, newHeaderRow);
createHeaderCell(newHeaderRow, "图片大小(M)");
createHeaderCell(newHeaderRow, "状态");
int newRowIndex = 1;
for (int i = 1; i <= totalRows; i++) {
if (i % 100 == 0) {
System.out.println("已处理 " + i + "/" + totalRows + " 行");
}
Row row = sheet.getRow(i);
if (row != null) {
Cell urlCell = row.getCell(7);
if (urlCell != null) {
String imageUrl = getCellValue(urlCell);
if (isValidUrl(imageUrl)) {
processedCount++;
long sizeInBytes = getImageSize(imageUrl);
double sizeInMegabytes = sizeInBytes / BYTES_TO_MEGABYTES;
boolean is404 = sizeInBytes == 0 && isUrl404(imageUrl);
if (sizeInMegabytes > SIZE_THRESHOLD || is404) {
filteredCount++;
Row newRow = newSheet.createRow(newRowIndex++);
copyRowWithDateHandling(row, newRow, workbook, newWorkbook);
newRow.createCell(headerRow.getLastCellNum()).setCellValue(sizeInMegabytes);
newRow.createCell(headerRow.getLastCellNum() + 1).setCellValue(is404 ? "404" : "图片过大");
}
}
}
}
}
try (FileOutputStream outputStream = new FileOutputStream(new File(outputFilePath))) {
newWorkbook.write(outputStream);
}
long endTime = System.currentTimeMillis();
System.out.println("筛选完成!耗时:" + (endTime - startTime) / 1000 + " 秒");
System.out.println("处理记录数:" + processedCount);
System.out.println("筛选出的记录数:" + filteredCount);
System.out.println("结果保存至:" + outputFilePath);
}
} catch (IOException e) {
LOGGER.log(Level.SEVERE, "处理文件时出错", e);
}
}
}
在上述代码中,我们通过FileInputStream读取原始Excel文件,利用XSSFWorkbook将其加载为Workbook对象。然后获取第一个工作表(Sheet),并遍历其行数据。对于筛选出的符合条件的行,我们创建新的Workbook对象(newWorkbook),并在其中创建新的工作表(newSheet),将原始表头复制过来并添加新列 “图片大小(M)” 和 “状态”,用于存储图片大小信息和筛选状态。
// 获取图片大小(字节)
private static long getImageSize(String imageUrl) {
HttpURLConnection connection = null;
try {
URL url = new URL(imageUrl);
connection = (HttpURLConnection) url.openConnection();
connection.setRequestMethod("HEAD");
connection.setConnectTimeout(CONNECT_TIMEOUT);
connection.setReadTimeout(READ_TIMEOUT);
connection.connect();
return connection.getResponseCode() == HttpURLConnection.HTTP_OK
? connection.getContentLength() : 0;
} catch (IOException e) {
LOGGER.log(Level.SEVERE, "获取图片大小异常", e);
return 0;
} finally {
if (connection != null) {
connection.disconnect();
}
}
}
// 判断URL是否404
private static boolean isUrl404(String imageUrl) {
HttpURLConnection connection = null;
try {
URL url = new URL(imageUrl);
connection = (HttpURLConnection) url.openConnection();
connection.setRequestMethod("HEAD");
connection.setConnectTimeout(CONNECT_TIMEOUT);
connection.setReadTimeout(READ_TIMEOUT);
connection.connect();
return connection.getResponseCode() == HttpURLConnection.HTTP_NOT_FOUND;
} catch (IOException e) {
LOGGER.log(Level.SEVERE, "检测404异常", e);
return false;
} finally {
if (connection != null) {
connection.disconnect();
}
}
}
这里,我们使用HttpURLConnection发送HEAD请求到指定的图片URL。HEAD请求不会下载资源实体内容,只请求资源的头部信息,这样可以快速获取图片的相关信息,如大小等。通过调用connection.getContentLength()方法可获取图片大小(以字节为单位)。同时,我们还定义了isUrl404()方法来判断URL是否返回404状态码,以便识别无法访问的图片。
// 获取单元格值(处理日期格式)
private static String getCellValue(Cell cell) {
if (cell == null) {
return "";
}
int cellType = cell.getCellType();
switch (cellType) {
case Cell.CELL_TYPE_STRING:
return cell.getStringCellValue();
case Cell.CELL_TYPE_NUMERIC:
return DateUtil.isCellDateFormatted(cell)
? DATE_FORMAT.format(cell.getDateCellValue())
: String.valueOf(cell.getNumericCellValue());
case Cell.CELL_TYPE_BOOLEAN:
return String.valueOf(cell.getBooleanCellValue());
case Cell.CELL_TYPE_FORMULA:
return cell.getCellFormula();
default:
return "";
}
}
在读取Excel单元格数据时,需考虑不同类型的数据处理方式。对于字符串类型单元格,直接获取其字符串值;对于数值型单元格,若其为日期格式(通过DateUtil.isCellDateFormatted(cell)判断),则将其转换为Date对象并按照指定格式(yyyy - MM - dd)格式化为字符串,否则以常规数值形式返回;对于布尔型单元格,返回对应的布尔值字符串;对于公式型单元格,返回其公式内容。
// 复制行(表头专用,不处理日期)
private static void copyRow(Row sourceRow, Row targetRow) {
for (int i = 0; i < sourceRow.getLastCellNum(); i++) {
Cell sourceCell = sourceRow.getCell(i);
Cell targetCell = targetRow.createCell(i);
if (sourceCell != null) {
int cellType = sourceCell.getCellType();
switch (cellType) {
case Cell.CELL_TYPE_STRING:
targetCell.setCellValue(sourceCell.getStringCellValue());
break;
case Cell.CELL_TYPE_NUMERIC:
targetCell.setCellValue(sourceCell.getNumericCellValue());
break;
case Cell.CELL_TYPE_BOOLEAN:
targetCell.setCellValue(sourceCell.getBooleanCellValue());
break;
case Cell.CELL_TYPE_FORMULA:
targetCell.setCellFormula(sourceCell.getCellFormula());
break;
}
}
}
}
// 复制行(数据行专用,处理日期格式)
private static void copyRowWithDateHandling(Row sourceRow, Row targetRow,
Workbook sourceWorkbook, Workbook targetWorkbook) {
for (int i = 0; i < sourceRow.getLastCellNum(); i++) {
Cell sourceCell = sourceRow.getCell(i);
Cell targetCell = targetRow.createCell(i);
if (sourceCell != null) {
int cellType = sourceCell.getCellType();
switch (cellType) {
case Cell.CELL_TYPE_STRING:
targetCell.setCellValue(sourceCell.getStringCellValue());
break;
case Cell.CELL_TYPE_NUMERIC:
if (DateUtil.isCellDateFormatted(sourceCell)) {
targetCell.setCellValue(sourceCell.getDateCellValue());
CellStyle newCellStyle = targetWorkbook.createCellStyle();
newCellStyle.cloneStyleFrom(sourceCell.getCellStyle());
targetCell.setCellStyle(newCellStyle);
} else {
targetCell.setCellValue(sourceCell.getNumericCellValue());
}
break;
case Cell.CELL_TYPE_BOOLEAN:
targetCell.setCellValue(sourceCell.getBooleanCellValue());
break;
case Cell.CELL_TYPE_FORMULA:
targetCell.setCellFormula(sourceCell.getCellFormula());
break;
}
}
}
}
为实现行数据的复制,我们定义了两个方法。copyRow()方法用于复制表头行数据,直接根据单元格类型设置目标单元格的值;copyRowWithDateHandling()方法用于复制数据行,在处理数值型单元格时,会判断其是否为日期格式,若是,则将其作为日期处理并复制对应的单元格样式,以确保日期格式在新Excel文件中正确显示。
在最初的实现中,我们遇到了以下问题:
通过引入DateUtil.isCellDateFormatted()方法检测单元格是否为日期格式,并使用SimpleDateFormat将Date对象格式化为指定字符串形式,成功解决了日期格式处理问题。优化后的代码如下:
private static String getCellValue(Cell cell) {
if (cell == null) {
return "";
}
int cellType = cell.getCellType();
switch (cellType) {
case Cell.CELL_TYPE_STRING:
return cell.getStringCellValue();
case Cell.CELL_TYPE_NUMERIC:
return DateUtil.isCellDateFormatted(cell)
? DATE_FORMAT.format(cell.getDateCellValue())
: String.valueOf(cell.getNumericCellValue());
case Cell.CELL_TYPE_BOOLEAN:
return String.valueOf(cell.getBooleanCellValue());
case Cell.CELL_TYPE_FORMULA:
return cell.getCellFormula();
default:
return "";
}
}
将枚举和switch表达式改为传统写法,使用Cell.CELL_TYPE_* 常量,并重构行复制方法,使其在Java 8环境下稳定运行。改造后的行复制方法如下:
private static void copyRowWithDateHandling(Row sourceRow, Row targetRow,
Workbook sourceWorkbook, Workbook targetWorkbook) {
for (int i = 0; i < sourceRow.getLastCellNum(); i++) {
Cell sourceCell = sourceRow.getCell(i);
Cell targetCell = targetRow.createCell(i);
if (sourceCell != null) {
int cellType = sourceCell.getCellType();
switch (cellType) {
case Cell.CELL_TYPE_STRING:
targetCell.setCellValue(sourceCell.getStringCellValue());
break;
case Cell.CELL_TYPE_NUMERIC:
if (DateUtil.isCellDateFormatted(sourceCell)) {
targetCell.setCellValue(sourceCell.getDateCellValue());
CellStyle newCellStyle = targetWorkbook.createCellStyle();
newCellStyle.cloneStyleFrom(sourceCell.getCellStyle());
targetCell.setCellStyle(newCellStyle);
} else {
targetCell.setCellValue(sourceCell.getNumericCellValue());
}
break;
case Cell.CELL_TYPE_BOOLEAN:
targetCell.setCellValue(sourceCell.getBooleanCellValue());
break;
case Cell.CELL_TYPE_FORMULA:
targetCell.setCellFormula(sourceCell.getCellFormula());
break;
}
}
}
}
经过上述优化后,我们的代码实现了以下功能:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
通过本案例,我们深刻体会到Java在数据处理领域的强大能力以及在实际业务开发中的广泛应用。借助Apache POI和HttpURLConnection等工具和技术,能够高效地实现Excel图片URL的筛选与大小检测功能,并解决实际业务中的数据清洗问题。在开发过程中,注重细节处理,如数据格式保留、跨版本兼容性等,是提升程序质量和用户体验的关键。未来,随着业务需求的不断拓展和技术的持续发展,我们可以对现有程序进行进一步优化和扩展,以满足更多样化的数据处理场景。